diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
index 761734674bdf..a0967d044a96 100644
--- a/sys/kern/kern_environment.c
+++ b/sys/kern/kern_environment.c
@@ -1,1158 +1,1161 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1998 Michael Smith
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The unified bootloader passes us a pointer to a preserved copy of
 * bootstrap/kernel environment variables. We convert them to a
 * dynamic array of strings later when the VM subsystem is up.
 *
 * We make these available through the kenv(2) syscall for userland
 * and through kern_getenv()/freeenv(), kern_setenv(), kern_unsetenv(),
 * and testenv() for the kernel.
 */

#include
#include
+#include <sys/eventhandler.h>
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static char	*_getenv_dynamic_locked(const char *name, int *idx);
static char	*_getenv_dynamic(const char *name, int *idx);
static char	*kenv_acquire(const char *name);
static void	kenv_release(const char *buf);

static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");

#define KENV_SIZE	512	/* Maximum number of environment strings */

static uma_zone_t kenv_zone;
static int	kenv_mvallen = KENV_MVALLEN;

/* pointer to the config-generated static environment */
char		*kern_envp;

/* pointer to the md-static environment */
char		*md_envp;
static int	md_env_len;
static int	md_env_pos;

static char	*kernenv_next(char *);

/* dynamic environment variables */
char		**kenvp;
struct mtx	kenv_lock;

/*
 * No need to protect this with a mutex since SYSINITS are single threaded.
 */
bool	dynamic_kenv;

#define KENV_CHECK	if (!dynamic_kenv) \
			    panic("%s: called before SI_SUB_KMEM", __func__)

static int
kenv_dump(struct thread *td, char **envp, int what, char *value, int len)
{
	char *buffer, *senv;
	size_t done, needed, buflen;
	int error;

	error = 0;
	buffer = NULL;
	done = needed = 0;

	MPASS(what == KENV_DUMP || what == KENV_DUMP_LOADER ||
	    what == KENV_DUMP_STATIC);

	/*
	 * For non-dynamic kernel environment, we pass in either md_envp or
	 * kern_envp and we must traverse with kernenv_next(). This shuffling
	 * of pointers simplifies the below loop by only differing in how envp
	 * is modified.
*/ if (what != KENV_DUMP) { senv = (char *)envp; envp = &senv; } buflen = len; if (buflen > KENV_SIZE * (KENV_MNAMELEN + kenv_mvallen + 2)) buflen = KENV_SIZE * (KENV_MNAMELEN + kenv_mvallen + 2); if (len > 0 && value != NULL) buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO); /* Only take the lock for the dynamic kenv. */ if (what == KENV_DUMP) mtx_lock(&kenv_lock); while (*envp != NULL) { len = strlen(*envp) + 1; needed += len; len = min(len, buflen - done); /* * If called with a NULL or insufficiently large * buffer, just keep computing the required size. */ if (value != NULL && buffer != NULL && len > 0) { bcopy(*envp, buffer + done, len); done += len; } /* Advance the pointer depending on the kenv format. */ if (what == KENV_DUMP) envp++; else senv = kernenv_next(senv); } if (what == KENV_DUMP) mtx_unlock(&kenv_lock); if (buffer != NULL) { error = copyout(buffer, value, done); free(buffer, M_TEMP); } td->td_retval[0] = ((done == needed) ? 0 : needed); return (error); } int sys_kenv(struct thread *td, struct kenv_args *uap) { char *name, *value; size_t len; int error; KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = false")); error = 0; switch (uap->what) { case KENV_DUMP: #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif return (kenv_dump(td, kenvp, uap->what, uap->value, uap->len)); case KENV_DUMP_LOADER: case KENV_DUMP_STATIC: #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif #ifdef PRESERVE_EARLY_KENV return (kenv_dump(td, uap->what == KENV_DUMP_LOADER ? (char **)md_envp : (char **)kern_envp, uap->what, uap->value, uap->len)); #else return (ENOENT); #endif case KENV_SET: error = priv_check(td, PRIV_KENV_SET); if (error) return (error); break; case KENV_UNSET: error = priv_check(td, PRIV_KENV_UNSET); if (error) return (error); break; } name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK); error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL); if (error) goto done; switch (uap->what) { case KENV_GET: #ifdef MAC error = mac_kenv_check_get(td->td_ucred, name); if (error) goto done; #endif value = kern_getenv(name); if (value == NULL) { error = ENOENT; goto done; } len = strlen(value) + 1; if (len > uap->len) len = uap->len; error = copyout(value, uap->value, len); freeenv(value); if (error) goto done; td->td_retval[0] = len; break; case KENV_SET: len = uap->len; if (len < 1) { error = EINVAL; goto done; } if (len > kenv_mvallen + 1) len = kenv_mvallen + 1; value = malloc(len, M_TEMP, M_WAITOK); error = copyinstr(uap->value, value, len, NULL); if (error) { free(value, M_TEMP); goto done; } #ifdef MAC error = mac_kenv_check_set(td->td_ucred, name, value); if (error == 0) #endif kern_setenv(name, value); free(value, M_TEMP); break; case KENV_UNSET: #ifdef MAC error = mac_kenv_check_unset(td->td_ucred, name); if (error) goto done; #endif error = kern_unsetenv(name); if (error) error = ENOENT; break; default: error = EINVAL; break; } done: free(name, M_TEMP); return (error); } /* * Populate the initial kernel environment. * * This is called very early in MD startup, either to provide a copy of the * environment obtained from a boot loader, or to provide an empty buffer into * which MD code can store an initial environment using kern_setenv() calls. * * kern_envp is set to the static_env generated by config(8). This implements * the env keyword described in config(5). * * If len is non-zero, the caller is providing an empty buffer. 
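 *
 * For instance (a sketch; early_env is a hypothetical MD buffer, not
 * part of this change):
 *
 *	static char early_env[512];
 *
 *	init_static_kenv(early_env, sizeof(early_env));
 *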
The caller will * subsequently use kern_setenv() to add up to len bytes of initial environment * before the dynamic environment is available. * * If len is zero, the caller is providing a pre-loaded buffer containing * environment strings. Additional strings cannot be added until the dynamic * environment is available. The memory pointed to must remain stable at least * until sysinit runs init_dynamic_kenv() and preferably until after SI_SUB_KMEM * is finished so that subr_hints routines may continue to use it until the * environments have been fully merged at the end of the pass. If no initial * environment is available from the boot loader, passing a NULL pointer allows * the static_env to be installed if it is configured. In this case, any call * to kern_setenv() prior to the setup of the dynamic environment will result in * a panic. */ void init_static_kenv(char *buf, size_t len) { KASSERT(!dynamic_kenv, ("kenv: dynamic_kenv already initialized")); /* * Suitably sized means it must be able to hold at least one empty * variable, otherwise things go belly up if a kern_getenv call is * made without a prior call to kern_setenv as we have a malformed * environment. */ KASSERT(len == 0 || len >= 2, ("kenv: static env must be initialized or suitably sized")); KASSERT(len == 0 || (*buf == '\0' && *(buf + 1) == '\0'), ("kenv: sized buffer must be initially empty")); /* * We may be called twice, with the second call needed to relocate * md_envp after enabling paging. md_envp is then garbage if it is * not null and the relocation will move it. Discard it so as to * not crash using its old value in our first call to kern_getenv(). * * The second call gives the same environment as the first except * in silly configurations where the static env disables itself. * * Other env calls don't handle possibly-garbage pointers, so must * not be made between enabling paging and calling here. */ md_envp = NULL; md_env_len = 0; md_env_pos = 0; /* * Give the static environment a chance to disable the loader(8) * environment first. This is done with loader_env.disabled=1. * * static_env and static_hints may both be disabled, but in slightly * different ways. For static_env, we just don't setup kern_envp and * it's as if a static env wasn't even provided. For static_hints, * we effectively zero out the buffer to stop the rest of the kernel * from being able to use it. * * We're intentionally setting this up so that static_hints.disabled may * be specified in either the MD env or the static env. This keeps us * consistent in our new world view. * * As a warning, the static environment may not be disabled in any way * if the static environment has disabled the loader environment. */ kern_envp = static_env; if (!getenv_is_true("loader_env.disabled")) { md_envp = buf; md_env_len = len; md_env_pos = 0; if (getenv_is_true("static_env.disabled")) { kern_envp[0] = '\0'; kern_envp[1] = '\0'; } } if (getenv_is_true("static_hints.disabled")) { static_hints[0] = '\0'; static_hints[1] = '\0'; } } /* Maximum suffix number appended for duplicate environment variable names. 
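 *
 * For example, if the loader and the static environment both set "foo",
 * the merged dynamic environment keeps the loader's value under the
 * plain name and suffixes the duplicate:
 *
 *	foo="from_loader"
 *	foo_1="from_static_env"
 *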
*/ #define MAXSUFFIX 9999 #define SUFFIXLEN strlen("_" __XSTRING(MAXSUFFIX)) static void getfreesuffix(char *cp, size_t *n) { size_t len = strlen(cp); char * ncp; ncp = malloc(len + SUFFIXLEN + 1, M_KENV, M_WAITOK); memcpy(ncp, cp, len); for (*n = 1; *n <= MAXSUFFIX; (*n)++) { sprintf(&ncp[len], "_%zu", *n); if (!_getenv_dynamic_locked(ncp, NULL)) break; } free(ncp, M_KENV); if (*n > MAXSUFFIX) panic("Too many duplicate kernel environment values: %s", cp); } static void init_dynamic_kenv_from(char *init_env, int *curpos) { char *cp, *cpnext, *eqpos, *found; size_t len, n; int i; if (init_env && *init_env != '\0') { found = NULL; i = *curpos; for (cp = init_env; cp != NULL; cp = cpnext) { cpnext = kernenv_next(cp); len = strlen(cp) + 1; if (i > KENV_SIZE) { printf( "WARNING: too many kenv strings, ignoring %s\n", cp); goto sanitize; } if (len > KENV_MNAMELEN + 1 + kenv_mvallen + 1) { printf( "WARNING: too long kenv string, ignoring %s\n", cp); goto sanitize; } eqpos = strchr(cp, '='); if (eqpos == NULL) { printf( "WARNING: malformed static env value, ignoring %s\n", cp); goto sanitize; } *eqpos = 0; /* * Handle duplicates in the environment as we go; we * add the duplicated assignments with _N suffixes. * This ensures that (a) if a variable is set in the * static environment and in the "loader" environment * provided by MD code, the value from the loader will * have the expected variable name and the value from * the static environment will have the suffix; and (b) * if the "loader" environment has the same variable * set multiple times (as is possible with values being * passed via the kernel "command line") the extra * values are visible to code which knows where to look * for them. */ found = _getenv_dynamic_locked(cp, NULL); if (found != NULL) { getfreesuffix(cp, &n); kenvp[i] = malloc(len + SUFFIXLEN, M_KENV, M_WAITOK); sprintf(kenvp[i++], "%s_%zu=%s", cp, n, &eqpos[1]); } else { kenvp[i] = malloc(len, M_KENV, M_WAITOK); *eqpos = '='; strcpy(kenvp[i++], cp); } sanitize: #ifdef PRESERVE_EARLY_KENV continue; #else explicit_bzero(cp, len - 1); #endif } *curpos = i; } } /* * Setup the dynamic kernel environment. */ static void init_dynamic_kenv(void *data __unused) { int dynamic_envpos; int size; TUNABLE_INT_FETCH("kenv_mvallen", &kenv_mvallen); size = KENV_MNAMELEN + 1 + kenv_mvallen + 1; kenv_zone = uma_zcreate("kenv", size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); dynamic_envpos = 0; init_dynamic_kenv_from(md_envp, &dynamic_envpos); init_dynamic_kenv_from(kern_envp, &dynamic_envpos); kenvp[dynamic_envpos] = NULL; mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF); dynamic_kenv = true; } SYSINIT(kenv, SI_SUB_KMEM + 1, SI_ORDER_FIRST, init_dynamic_kenv, NULL); void freeenv(char *env) { if (dynamic_kenv && env != NULL) { explicit_bzero(env, strlen(env)); uma_zfree(kenv_zone, env); } } /* * Internal functions for string lookup. 
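 *
 * Note that dynamic lookups return a copy that the caller must release;
 * the usual consumer pattern is (a sketch; parse() stands in for any
 * consumer of the value):
 *
 *	char *val;
 *
 *	if ((val = kern_getenv("vfs.root.mountfrom")) != NULL) {
 *		parse(val);
 *		freeenv(val);
 *	}
 *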
*/ static char * _getenv_dynamic_locked(const char *name, int *idx) { char *cp; int len, i; len = strlen(name); for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { if ((strncmp(cp, name, len) == 0) && (cp[len] == '=')) { if (idx != NULL) *idx = i; return (cp + len + 1); } } return (NULL); } static char * _getenv_dynamic(const char *name, int *idx) { mtx_assert(&kenv_lock, MA_OWNED); return (_getenv_dynamic_locked(name, idx)); } static char * _getenv_static_from(char *chkenv, const char *name) { char *cp, *ep; int len; for (cp = chkenv; cp != NULL; cp = kernenv_next(cp)) { for (ep = cp; (*ep != '=') && (*ep != 0); ep++) ; if (*ep != '=') continue; len = ep - cp; ep++; if (!strncmp(name, cp, len) && name[len] == 0) return (ep); } return (NULL); } static char * _getenv_static(const char *name) { char *val; val = _getenv_static_from(md_envp, name); if (val != NULL) return (val); val = _getenv_static_from(kern_envp, name); if (val != NULL) return (val); return (NULL); } /* * Look up an environment variable by name. * Return a pointer to the string if found. * The pointer has to be freed with freeenv() * after use. */ char * kern_getenv(const char *name) { char *cp, *ret; int len; if (dynamic_kenv) { len = KENV_MNAMELEN + 1 + kenv_mvallen + 1; ret = uma_zalloc(kenv_zone, M_WAITOK | M_ZERO); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); if (cp != NULL) strlcpy(ret, cp, len); mtx_unlock(&kenv_lock); if (cp == NULL) { uma_zfree(kenv_zone, ret); ret = NULL; } } else ret = _getenv_static(name); return (ret); } /* * Test if an environment variable is defined. */ int testenv(const char *name) { char *cp; cp = kenv_acquire(name); kenv_release(cp); if (cp != NULL) return (1); return (0); } /* * Set an environment variable in the MD-static environment. This cannot * feasibly be done on config(8)-generated static environments as they don't * generally include space for extra variables. */ static int setenv_static(const char *name, const char *value) { int len; if (md_env_pos >= md_env_len) return (-1); /* Check space for x=y and two nuls */ len = strlen(name) + strlen(value); if (len + 3 < md_env_len - md_env_pos) { len = sprintf(&md_envp[md_env_pos], "%s=%s", name, value); md_env_pos += len+1; md_envp[md_env_pos] = '\0'; return (0); } else return (-1); } /* * Set an environment variable by name. */ int kern_setenv(const char *name, const char *value) { char *buf, *cp, *oldenv; int namelen, vallen, i; if (!dynamic_kenv && md_env_len > 0) return (setenv_static(name, value)); KENV_CHECK; namelen = strlen(name) + 1; if (namelen > KENV_MNAMELEN + 1) return (-1); vallen = strlen(value) + 1; if (vallen > kenv_mvallen + 1) return (-1); buf = malloc(namelen + vallen, M_KENV, M_WAITOK); sprintf(buf, "%s=%s", name, value); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; kenvp[i] = buf; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); } else { /* We add the option if it wasn't found */ for (i = 0; (cp = kenvp[i]) != NULL; i++) ; /* Bounds checking */ if (i < 0 || i >= KENV_SIZE) { free(buf, M_KENV); mtx_unlock(&kenv_lock); return (-1); } kenvp[i] = buf; kenvp[i + 1] = NULL; mtx_unlock(&kenv_lock); } + EVENTHANDLER_INVOKE(setenv, name); return (0); } /* * Unset an environment variable string. 
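 *
 * Both kern_setenv() and kern_unsetenv() now notify interested
 * subsystems through the setenv/unsetenv eventhandlers. A consumer
 * would subscribe roughly as follows (a sketch; my_setenv_cb is
 * hypothetical, not part of this change):
 *
 *	static void
 *	my_setenv_cb(void *arg __unused, char *name)
 *	{
 *		printf("kenv changed: %s\n", name);
 *	}
 *	EVENTHANDLER_DEFINE(setenv, my_setenv_cb, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 *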
*/ int kern_unsetenv(const char *name) { char *cp, *oldenv; int i, j; KENV_CHECK; mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; for (j = i + 1; kenvp[j] != NULL; j++) kenvp[i++] = kenvp[j]; kenvp[i] = NULL; mtx_unlock(&kenv_lock); zfree(oldenv, M_KENV); + EVENTHANDLER_INVOKE(unsetenv, name); return (0); } mtx_unlock(&kenv_lock); return (-1); } /* * Return the internal kenv buffer for the variable name, if it exists. * If the dynamic kenv is initialized and the name is present, return * with kenv_lock held. */ static char * kenv_acquire(const char *name) { char *value; if (dynamic_kenv) { mtx_lock(&kenv_lock); value = _getenv_dynamic(name, NULL); if (value == NULL) mtx_unlock(&kenv_lock); return (value); } else return (_getenv_static(name)); } /* * Undo a previous kenv_acquire() operation */ static void kenv_release(const char *buf) { if ((buf != NULL) && dynamic_kenv) mtx_unlock(&kenv_lock); } /* * Return a string value from an environment variable. */ int getenv_string(const char *name, char *data, int size) { char *cp; cp = kenv_acquire(name); if (cp != NULL) strlcpy(data, cp, size); kenv_release(cp); return (cp != NULL); } /* * Return an array of integers at the given type size and signedness. */ int getenv_array(const char *name, void *pdata, int size, int *psize, int type_size, bool allow_signed) { uint8_t shift; int64_t value; int64_t old; const char *buf; char *end; const char *ptr; int n; int rc; rc = 0; /* assume failure */ buf = kenv_acquire(name); if (buf == NULL) goto error; /* get maximum number of elements */ size /= type_size; n = 0; for (ptr = buf; *ptr != 0; ) { value = strtoq(ptr, &end, 0); /* check if signed numbers are allowed */ if (value < 0 && !allow_signed) goto error; /* check for invalid value */ if (ptr == end) goto error; /* check for valid suffix */ switch (*end) { case 't': case 'T': shift = 40; end++; break; case 'g': case 'G': shift = 30; end++; break; case 'm': case 'M': shift = 20; end++; break; case 'k': case 'K': shift = 10; end++; break; case ' ': case '\t': case ',': case 0: shift = 0; break; default: /* garbage after numeric value */ goto error; } /* skip till next value, if any */ while (*end == '\t' || *end == ',' || *end == ' ') end++; /* update pointer */ ptr = end; /* apply shift */ old = value; value <<= shift; /* overflow check */ if ((value >> shift) != old) goto error; /* check for buffer overflow */ if (n >= size) goto error; /* store value according to type size */ switch (type_size) { case 1: if (allow_signed) { if (value < SCHAR_MIN || value > SCHAR_MAX) goto error; } else { if (value < 0 || value > UCHAR_MAX) goto error; } ((uint8_t *)pdata)[n] = (uint8_t)value; break; case 2: if (allow_signed) { if (value < SHRT_MIN || value > SHRT_MAX) goto error; } else { if (value < 0 || value > USHRT_MAX) goto error; } ((uint16_t *)pdata)[n] = (uint16_t)value; break; case 4: if (allow_signed) { if (value < INT_MIN || value > INT_MAX) goto error; } else { if (value > UINT_MAX) goto error; } ((uint32_t *)pdata)[n] = (uint32_t)value; break; case 8: ((uint64_t *)pdata)[n] = (uint64_t)value; break; default: goto error; } n++; } *psize = n * type_size; if (n != 0) rc = 1; /* success */ error: kenv_release(buf); return (rc); } /* * Return an integer value from an environment variable. */ int getenv_int(const char *name, int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int) tmp; return (rval); } /* * Return an unsigned integer value from an environment variable. 
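 *
 * The numeric parsers here (getenv_array() above and getenv_quad()
 * below) accept k/m/g/t suffixes. For instance (a sketch; hw.foo.sizes
 * is a hypothetical tunable), "hw.foo.sizes=4k,64k" read via
 *
 *	uint32_t sizes[2];
 *	int len;
 *
 *	getenv_array("hw.foo.sizes", sizes, sizeof(sizes), &len,
 *	    sizeof(sizes[0]), GETENV_UNSIGNED);
 *
 * stores 4096 and 65536 and sets len to sizeof(sizes).
 *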
*/ int getenv_uint(const char *name, unsigned int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned int) tmp; return (rval); } /* * Return an int64_t value from an environment variable. */ int getenv_int64(const char *name, int64_t *data) { quad_t tmp; int64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int64_t) tmp; return (rval); } /* * Return an uint64_t value from an environment variable. */ int getenv_uint64(const char *name, uint64_t *data) { quad_t tmp; uint64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (uint64_t) tmp; return (rval); } /* * Return a long value from an environment variable. */ int getenv_long(const char *name, long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (long) tmp; return (rval); } /* * Return an unsigned long value from an environment variable. */ int getenv_ulong(const char *name, unsigned long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned long) tmp; return (rval); } /* * Return a quad_t value from an environment variable. */ int getenv_quad(const char *name, quad_t *data) { const char *value; char suffix, *vtp; quad_t iv; value = kenv_acquire(name); if (value == NULL) { goto error; } iv = strtoq(value, &vtp, 0); if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) { goto error; } suffix = vtp[0]; kenv_release(value); switch (suffix) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (0); } *data = iv; return (1); error: kenv_release(value); return (0); } /* * Return a boolean value from an environment variable. This can be in * numerical or string form, i.e. "1" or "true". */ int getenv_bool(const char *name, bool *data) { char *val; int ret = 0; if (name == NULL) return (0); val = kern_getenv(name); if (val == NULL) return (0); if ((strcmp(val, "1") == 0) || (strcasecmp(val, "true") == 0)) { *data = true; ret = 1; } else if ((strcmp(val, "0") == 0) || (strcasecmp(val, "false") == 0)) { *data = false; ret = 1; } else { /* Spit out a warning for malformed boolean variables. */ printf("Environment variable %s has non-boolean value \"%s\"\n", name, val); } freeenv(val); return (ret); } /* * Wrapper around getenv_bool to easily check for true. */ bool getenv_is_true(const char *name) { bool val; if (getenv_bool(name, &val) != 0) return (val); return (false); } /* * Wrapper around getenv_bool to easily check for false. */ bool getenv_is_false(const char *name) { bool val; if (getenv_bool(name, &val) != 0) return (!val); return (false); } /* * Find the next entry after the one which (cp) falls within, return a * pointer to its start or NULL if there are no more. 
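 *
 * The buffers walked here hold consecutive NUL-terminated "name=value"
 * strings, with an empty string marking the end, e.g.:
 *
 *	"one=1\0two=2\0\0"
 *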
*/ static char * kernenv_next(char *cp) { if (cp != NULL) { while (*cp != 0) cp++; cp++; if (*cp == 0) cp = NULL; } return (cp); } void tunable_int_init(void *data) { struct tunable_int *d = (struct tunable_int *)data; TUNABLE_INT_FETCH(d->path, d->var); } void tunable_long_init(void *data) { struct tunable_long *d = (struct tunable_long *)data; TUNABLE_LONG_FETCH(d->path, d->var); } void tunable_ulong_init(void *data) { struct tunable_ulong *d = (struct tunable_ulong *)data; TUNABLE_ULONG_FETCH(d->path, d->var); } void tunable_int64_init(void *data) { struct tunable_int64 *d = (struct tunable_int64 *)data; TUNABLE_INT64_FETCH(d->path, d->var); } void tunable_uint64_init(void *data) { struct tunable_uint64 *d = (struct tunable_uint64 *)data; TUNABLE_UINT64_FETCH(d->path, d->var); } void tunable_quad_init(void *data) { struct tunable_quad *d = (struct tunable_quad *)data; TUNABLE_QUAD_FETCH(d->path, d->var); } void tunable_bool_init(void *data) { struct tunable_bool *d = (struct tunable_bool *)data; TUNABLE_BOOL_FETCH(d->path, d->var); } void tunable_str_init(void *data) { struct tunable_str *d = (struct tunable_str *)data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index a1d502d58bff..780eb6099b07 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,3016 +1,3119 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_sysctl.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. * * The sysctlstringlock is used to protect concurrent access to writable * string nodes in sysctl_handle_string(). */ static struct rmlock sysctllock; static struct sx __exclusive_cache_line sysctlmemlock; static struct sx sysctlstringlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children); static char* sysctl_escape_name(const char*); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); +static int name2oid(char *, int *, int *, struct sysctl_oid **); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, list) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. 
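 *
 * Static oids land in this tree via the declaration macros, e.g.
 * (a sketch; kern.example and example_var are hypothetical):
 *
 *	static int example_var;
 *	SYSCTL_INT(_kern, OID_AUTO, example, CTLFLAG_RWTUN, &example_var,
 *	    0, "Example tunable oid");
 *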
 */
void
sysctl_wlock(void)
{

	SYSCTL_WLOCK();
}

void
sysctl_wunlock(void)
{

	SYSCTL_WUNLOCK();
}

static int
sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2,
    struct sysctl_req *req, struct rm_priotracker *tracker)
{
	int error;

	if (oid->oid_kind & CTLFLAG_DYN)
		atomic_add_int(&oid->oid_running, 1);

	if (tracker != NULL)
		SYSCTL_RUNLOCK(tracker);
	else
		SYSCTL_WUNLOCK();

	/*
	 * Treat set CTLFLAG_NEEDGIANT and unset CTLFLAG_MPSAFE flags the same,
	 * until we're ready to remove all traces of Giant from sysctl(9).
	 */
	if ((oid->oid_kind & CTLFLAG_NEEDGIANT) ||
	    (!(oid->oid_kind & CTLFLAG_MPSAFE)))
		mtx_lock(&Giant);
	error = oid->oid_handler(oid, arg1, arg2, req);
	if ((oid->oid_kind & CTLFLAG_NEEDGIANT) ||
	    (!(oid->oid_kind & CTLFLAG_MPSAFE)))
		mtx_unlock(&Giant);

	KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);

	if (tracker != NULL)
		SYSCTL_RLOCK(tracker);
	else
		SYSCTL_WLOCK();

	if (oid->oid_kind & CTLFLAG_DYN) {
		if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 &&
		    (oid->oid_kind & CTLFLAG_DYING) != 0)
			wakeup(&oid->oid_running);
	}

	return (error);
}

static void
sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp)
{
	struct sysctl_req req;
	struct sysctl_oid *curr;
	char *penv = NULL;
	char path[96];
	ssize_t rem = sizeof(path);
	ssize_t len;
	uint8_t data[512] __aligned(sizeof(uint64_t));
	int size;
	int error;

	path[--rem] = 0;

	for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) {
		len = strlen(curr->oid_name);
		rem -= len;
		if (curr != oidp)
			rem -= 1;
		if (rem < 0) {
			printf("OID path exceeds %d bytes\n", (int)sizeof(path));
			return;
		}
		memcpy(path + rem, curr->oid_name, len);
		if (curr != oidp)
			path[rem + len] = '.';
	}

	memset(&req, 0, sizeof(req));

	req.td = curthread;
	req.oldfunc = sysctl_old_kernel;
	req.newfunc = sysctl_new_kernel;
	req.lock = REQ_UNWIRED;

	switch (oidp->oid_kind & CTLTYPE) {
	case CTLTYPE_INT:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_UINT:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_LONG:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(long), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_ULONG:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(long), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_S8:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int8_t), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_S16:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int16_t), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_S32:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int32_t), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_S64:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(int64_t), GETENV_SIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_U8:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(uint8_t), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_U16:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(uint16_t), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_U32:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(uint32_t), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_U64:
		if (getenv_array(path + rem, data, sizeof(data), &size,
		    sizeof(uint64_t), GETENV_UNSIGNED) == 0)
			return;
		req.newlen = size;
		req.newptr = data;
		break;
	case CTLTYPE_STRING:
		penv = kern_getenv(path + rem);
		if (penv == NULL)
			return;
		req.newlen = strlen(penv);
		req.newptr = penv;
		break;
	default:
		return;
	}

	error = sysctl_root_handler_locked(oidp, oidp->oid_arg1,
	    oidp->oid_arg2, &req, NULL);
	if (error != 0)
		printf("Setting sysctl %s failed: %d\n", path + rem, error);
	if (penv != NULL)
		freeenv(penv);
}

/*
 * Locate the path to a given oid. Returns the length of the resulting path,
 * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME
 * elements.
 */
static int
sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle)
{
	int indx;

	SYSCTL_ASSERT_LOCKED();
	indx = 0;
	/*
	 * Do a depth-first search of the oid tree, looking for 'needle'. Start
	 * with the first child of the root.
	 */
	nodes[indx] = RB_MIN(sysctl_oid_list, &sysctl__children);
	for (;;) {
		if (nodes[indx] == needle)
			return (indx + 1);

		if (nodes[indx] == NULL) {
			/* Node has no more siblings, so back up to parent. */
			if (indx-- == 0) {
				/* Retreat to root, so give up. */
				break;
			}
		} else if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
			/* Node has children. */
			if (++indx == CTL_MAXNAME) {
				/* Max search depth reached, so give up. */
				break;
			}
			/* Start with the first child. */
			nodes[indx] = RB_MIN(sysctl_oid_list,
			    &nodes[indx - 1]->oid_children);
			continue;
		}

		/* Consider next sibling. */
		nodes[indx] = RB_NEXT(sysctl_oid_list, NULL, nodes[indx]);
	}
	return (-1);
}

static void
sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf)
{
	struct sysctl_oid *nodes[CTL_MAXNAME];
	char buf[128];
	struct sbuf sb;
	int rc, i;

	(void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__);

	rc = sysctl_search_oid(nodes, leaf);
	if (rc > 0) {
		for (i = 0; i < rc; i++)
			sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name,
			    i != (rc - 1), ".");
	} else {
		sbuf_printf(&sb, "%s", leaf->oid_name);
	}

	sbuf_printf(&sb, ")!\n");

	(void)sbuf_finish(&sb);
}

#ifdef SYSCTL_DEBUG
static int
sysctl_reuse_test(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;

	SYSCTL_RLOCK(&tracker);
	sysctl_warn_reuse(__func__, oidp);
	SYSCTL_RUNLOCK(&tracker);
	return (0);
}
SYSCTL_PROC(_sysctl, OID_AUTO, reuse_test,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
    sysctl_reuse_test, "-", "");
#endif

void
sysctl_register_oid(struct sysctl_oid *oidp)
{
	struct sysctl_oid_list *parent = oidp->oid_parent;
	struct sysctl_oid *p, key;
	int oid_number;
	int timeout = 2;

	/*
	 * First check if another oid with the same name already
	 * exists in the parent's list.
	 */
	SYSCTL_ASSERT_WLOCKED();
	p = sysctl_find_oidname(oidp->oid_name, parent);
	if (p != NULL) {
		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
			p->oid_refcnt++;
			return;
		} else {
			sysctl_warn_reuse(__func__, p);
			return;
		}
	}
	/* get current OID number */
	oid_number = oidp->oid_number;

#if (OID_AUTO >= 0)
#error "OID_AUTO is expected to be a negative value"
#endif

	/*
	 * Any negative OID number qualifies as OID_AUTO. Valid OID
	 * numbers should always be positive.
	 *
	 * NOTE: DO NOT change the starting value here, change it in
	 * <sys/sysctl.h>, and make sure it is at least 256 to
	 * accommodate e.g. net.inet.raw as a static sysctl node.
*/ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. */ key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); while (p != NULL && oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); continue; } p = RB_NEXT(sysctl_oid_list, NULL, p); oid_number++; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; RB_INSERT(sysctl_oid_list, parent, oidp); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { - /* only fetch value once */ - oidp->oid_kind |= CTLFLAG_NOFETCH; +#ifdef VIMAGE + /* + * Can fetch value multiple times for VNET loader tunables. + * Only fetch once for non-VNET loader tunables. + */ + if ((oidp->oid_kind & CTLFLAG_VNET) == 0) +#endif + oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp)) error = 0; } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. 
But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. * e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if (e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p; int error; error = ENOENT; SYSCTL_WLOCK(); p = sysctl_find_oidname(name, &parent->oid_children); if (p) error = sysctl_remove_oid_locked(p, del, recurse); SYSCTL_WUNLOCK(); return (error); } /* * Duplicate the provided string, escaping any illegal characters. The result * must be freed when no longer in use. * * The list of illegal characters is ".". */ static char* sysctl_escape_name(const char* orig) { int i, s = 0, d = 0, nillegals = 0; char *new; /* First count the number of illegal characters */ for (i = 0; orig[i] != '\0'; i++) { if (orig[i] == '.') nillegals++; } /* Allocate storage for new string */ new = malloc(i + 2 * nillegals + 1, M_SYSCTLOID, M_WAITOK); /* Copy the name, escaping characters as we go */ while (orig[s] != '\0') { if (orig[s] == '.') { /* %25 is the hexadecimal representation of '.' 
*/ new[d++] = '%'; new[d++] = '2'; new[d++] = '5'; s++; } else { new[d++] = orig[s++]; } } /* Finally, nul-terminate */ new[d] = '\0'; return (new); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children); p != NULL; p = tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } tmp = RB_NEXT(sysctl_oid_list, &oidp->oid_children, p); error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); if (oidp->oid_label) free(__DECONST(char *, oidp->oid_label), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label) { struct sysctl_oid *oidp; char *escaped; /* You have to hook up somewhere.. 
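 *
 * A typical dynamic consumer ties the new oid to a context so it can be
 * torn down later (a sketch; example_var is hypothetical):
 *
 *	static struct sysctl_ctx_list ctx;
 *
 *	sysctl_ctx_init(&ctx);
 *	SYSCTL_ADD_INT(&ctx, SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO,
 *	    "example", CTLFLAG_RW, &example_var, 0, "example oid");
 *	...
 *	sysctl_ctx_free(&ctx);
 *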
*/ if (parent == NULL) return(NULL); escaped = sysctl_escape_name(name); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(escaped, parent); if (oidp != NULL) { free(escaped, M_SYSCTLOID); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { sysctl_warn_reuse(__func__, oidp); SYSCTL_WUNLOCK(); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; RB_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = escaped; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); if (label != NULL) oidp->oid_label = strdup(label, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); sx_init(&sysctlstringlock, "sysctl string handler"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); +#ifdef VIMAGE +static void +sysctl_setenv_vnet(void *arg __unused, char *name) +{ + struct sysctl_oid *oidp; + int oid[CTL_MAXNAME]; + int error, nlen; + + SYSCTL_WLOCK(); + error = name2oid(name, oid, &nlen, &oidp); + if (error) + goto out; + + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && + (oidp->oid_kind & CTLFLAG_VNET) != 0 && + (oidp->oid_kind & CTLFLAG_TUN) != 0 && + (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { + /* Update value from kernel environment */ + sysctl_load_tunable_by_oid_locked(oidp); + } +out: + SYSCTL_WUNLOCK(); +} + +static void +sysctl_unsetenv_vnet(void *arg __unused, char *name) +{ + struct sysctl_oid *oidp; + int oid[CTL_MAXNAME]; + int error, nlen; + + SYSCTL_WLOCK(); + /* + * The setenv / unsetenv event handlers are invoked by kern_setenv() / + * kern_unsetenv() without exclusive locks. It is rare but still possible + * that the invoke order of event handlers is different from that of + * kern_setenv() and kern_unsetenv(). + * Re-check environment variable string to make sure it is unset. 
+ */
+	if (testenv(name))
+		goto out;
+	error = name2oid(name, oid, &nlen, &oidp);
+	if (error)
+		goto out;
+
+	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
+	    (oidp->oid_kind & CTLFLAG_VNET) != 0 &&
+	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
+	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
+		size_t size;
+
+		switch (oidp->oid_kind & CTLTYPE) {
+		case CTLTYPE_INT:
+		case CTLTYPE_UINT:
+			size = sizeof(int);
+			break;
+		case CTLTYPE_LONG:
+		case CTLTYPE_ULONG:
+			size = sizeof(long);
+			break;
+		case CTLTYPE_S8:
+		case CTLTYPE_U8:
+			size = sizeof(int8_t);
+			break;
+		case CTLTYPE_S16:
+		case CTLTYPE_U16:
+			size = sizeof(int16_t);
+			break;
+		case CTLTYPE_S32:
+		case CTLTYPE_U32:
+			size = sizeof(int32_t);
+			break;
+		case CTLTYPE_S64:
+		case CTLTYPE_U64:
+			size = sizeof(int64_t);
+			break;
+		case CTLTYPE_STRING:
+			MPASS(oidp->oid_arg2 > 0);
+			size = oidp->oid_arg2;
+			break;
+		default:
+			goto out;
+		}
+		vnet_restore_init(oidp->oid_arg1, size);
+	}
+out:
+	SYSCTL_WUNLOCK();
+}
+
+/*
+ * Register the kernel's setenv / unsetenv events.
+ */
+EVENTHANDLER_DEFINE(setenv, sysctl_setenv_vnet, NULL, EVENTHANDLER_PRI_ANY);
+EVENTHANDLER_DEFINE(unsetenv, sysctl_unsetenv_vnet, NULL, EVENTHANDLER_PRI_ANY);
+#endif
+
/*
 * "Staff-functions"
 *
 * These functions implement a presently undocumented interface
 * used by the sysctl program to walk the tree, and get the type
 * so it can print the value.
 * This interface is under work and consideration, and should probably
 * be killed with a big axe by the first person who can find the time.
 * (be aware though, that the proper interface isn't as obvious as it
 * may seem, there are various conflicting requirements.
 *
 * {CTL_SYSCTL, CTL_SYSCTL_DEBUG}		printf the entire MIB-tree.
 * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...}		return the name of the "..."
 *						OID.
 * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...}		return the next OID, honoring
 *						CTLFLAG_SKIP.
 * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID}		return the OID of the name in
 *						"new"
 * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...}		return the kind & format info
 *						for the "..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...}	return the description of the
 *						"..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...}	return the aggregation label of
 *						the "..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_NEXTNOSKIP, ...}	return the next OID, ignoring
 *						CTLFLAG_SKIP.
 */

#ifdef SYSCTL_DEBUG
static void
sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
{
	int k;
	struct sysctl_oid *oidp;

	SYSCTL_ASSERT_LOCKED();
	SYSCTL_FOREACH(oidp, l) {
		for (k=0; k<i; k++)
			printf(" ");

		printf("%d %s ", oidp->oid_number, oidp->oid_name);

		printf("%c%c",
		    oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
		    oidp->oid_kind & CTLFLAG_WR ?
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error; struct sysctl_oid *oid, key; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; key.oid_number = *name; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid) { if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE && !oid->oid_handler) lsp2 = SYSCTL_CHILDREN(oid); } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); enum sysctl_iter_action { ITER_SIBLINGS, /* Not matched, continue iterating siblings */ ITER_CHILDREN, /* Node has children we need to iterate over them */ ITER_FOUND, /* Matching node was found */ }; /* * Tries to find the next node for @name and @namelen. * * Returns next action to take. */ static enum sysctl_iter_action sysctl_sysctl_next_node(struct sysctl_oid *oidp, int *name, unsigned int namelen, bool honor_skip) { if ((oidp->oid_kind & CTLFLAG_DORMANT) != 0) return (ITER_SIBLINGS); if (honor_skip && (oidp->oid_kind & CTLFLAG_SKIP) != 0) return (ITER_SIBLINGS); if (namelen == 0) { /* * We have reached a node with a full name match and are * looking for the next oid in its children. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. 
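 *
 * For example (a sketch), the query {CTL_SYSCTL, CTL_SYSCTL_NEXT,
 * CTL_KERN} arrives here with namelen == 0 once the "kern" component
 * has been consumed; "kern" is a handler-less CTLTYPE_NODE, so the walk
 * descends into its children rather than returning the node itself.
 *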
*/ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); /* Report oid as a node to iterate */ return (ITER_CHILDREN); } /* * No match yet. Continue seeking the given name. * * We are iterating in order by oid_number, so skip oids lower * than the one we are looking for. * * When the current oid_number is higher than the one we seek, * that means we have reached the next oid in the sequence and * should return it. * * If the oid_number matches the name at this level then we * have to find a node to continue searching at the next level. */ if (oidp->oid_number < *name) return (ITER_SIBLINGS); if (oidp->oid_number > *name) { /* * We have reached the next oid. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. */ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); return (ITER_CHILDREN); } /* match at a current level */ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_SIBLINGS); if (oidp->oid_handler) return (ITER_SIBLINGS); return (ITER_CHILDREN); } /* * Recursively walk the sysctl subtree at lsp until we find the given name. * Returns true and fills in next oid data in @next and @len if oid is found. */ static bool sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, bool honor_skip) { struct sysctl_oid_list *next_lsp; struct sysctl_oid *oidp = NULL, key; bool success = false; enum sysctl_iter_action action; SYSCTL_ASSERT_LOCKED(); /* * Start the search at the requested oid. But if not found, then scan * through all children. */ if (namelen > 0) { key.oid_number = *name; oidp = RB_FIND(sysctl_oid_list, lsp, &key); } if (!oidp) oidp = RB_MIN(sysctl_oid_list, lsp); for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) { action = sysctl_sysctl_next_node(oidp, name, namelen, honor_skip); if (action == ITER_SIBLINGS) continue; if (action == ITER_FOUND) { success = true; break; } KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action)); next_lsp = SYSCTL_CHILDREN(oidp); if (namelen == 0) { success = sysctl_sysctl_next_action(next_lsp, NULL, 0, next + 1, len, level + 1, honor_skip); } else { success = sysctl_sysctl_next_action(next_lsp, name + 1, namelen - 1, next + 1, len, level + 1, honor_skip); if (!success) { /* * We maintain the invariant that current node oid * is >= the oid provided in @name. * As there are no usable children at this node, * current node oid is strictly > than the requested * oid. * Hence, reduce namelen to 0 to allow for picking first * nodes/leafs in the next node in list. 
*/ namelen = 0; } } if (success) break; } if (success) { *next = oidp->oid_number; if (level > *len) *len = level; } return (success); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int len, error; bool success; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int next[CTL_MAXNAME]; len = 0; SYSCTL_RLOCK(&tracker); success = sysctl_sysctl_next_action(lsp, name, namelen, next, &len, 1, oidp->oid_number == CTL_SYSCTL_NEXT); SYSCTL_RUNLOCK(&tracker); if (!success) return (ENOENT); error = SYSCTL_OUT(req, next, len * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXTNOSKIP, nextnoskip, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { oidp = sysctl_find_oidname(strsep(&name, "."), lsp); if (oidp == NULL) return (ENOENT); *oid++ = oidp->oid_number; (*len)++; if (name == NULL || *name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. 
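 *
 * This node is essentially what sysctlnametomib(3) is built on.  A
 * rough userland sketch (error handling elided):
 *
 *	int qoid[2] = { CTL_SYSCTL, CTL_SYSCTL_NAME2OID };
 *	int oid[CTL_MAXNAME];
 *	size_t len = sizeof(oid);
 *
 *	if (sysctl(qoid, 2, oid, &len, "kern.ostype",
 *	    strlen("kern.ostype")) == 0)
 *		;	/* oid[] now holds len / sizeof(int) MIB entries. */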
*/ SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_label == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_label, strlen(oid->oid_label) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. */ /* * Handle a bool. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_bool(SYSCTL_HANDLER_ARGS) { uint8_t temp; int error; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) temp = *(bool *)arg1 ? 1 : 0; else temp = arg2 ? 1 : 0; error = SYSCTL_OUT(req, &temp, sizeof(temp)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { error = SYSCTL_IN(req, &temp, sizeof(temp)); if (!error) *(bool *)arg1 = temp ? 1 : 0; } return (error); } /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. 
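 *
 * (The variable case corresponds to the declaration macros; as a
 * hedged sketch, a hypothetical knob declared with
 *
 *	static int16_t foo_val;
 *	SYSCTL_S16(_debug, OID_AUTO, foo, CTLFLAG_RW, &foo_val, 0, "");
 *
 * arrives here with arg1 == &foo_val, while a constant registered with
 * a NULL pointer is delivered through arg2 instead.)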
*/ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { char *tmparg; size_t outlen; int error = 0, ro_string = 0; /* * If the sysctl isn't writable and isn't a preallocated tunable that * can be modified by kenv(2), microoptimise and treat it as a * read-only string. * A zero-length buffer indicates a fixed size read-only * string. 
In ddb, don't worry about trying to make a malloced * snapshot. */ if ((oidp->oid_kind & (CTLFLAG_WR | CTLFLAG_TUN)) == 0 || arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { if (ro_string) { tmparg = arg1; outlen = strlen(tmparg) + 1; } else { tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); sx_slock(&sysctlstringlock); memcpy(tmparg, arg1, arg2); sx_sunlock(&sysctlstringlock); outlen = strlen(tmparg) + 1; } error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { if (!ro_string) sx_slock(&sysctlstringlock); outlen = strlen((char *)arg1) + 1; if (!ro_string) sx_sunlock(&sysctlstringlock); error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if (req->newlen - req->newidx >= arg2 || req->newlen - req->newidx < 0) { error = EINVAL; } else if (req->newlen - req->newidx == 0) { sx_xlock(&sysctlstringlock); ((char *)arg1)[0] = '\0'; sx_xunlock(&sysctlstringlock); } else if (req->newfunc == sysctl_new_kernel) { arg2 = req->newlen - req->newidx; sx_xlock(&sysctlstringlock); error = SYSCTL_IN(req, arg1, arg2); if (error == 0) { ((char *)arg1)[arg2] = '\0'; req->newidx += arg2; } sx_xunlock(&sysctlstringlock); } else { arg2 = req->newlen - req->newidx; tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); error = SYSCTL_IN(req, tmparg, arg2); if (error) { free(tmparg, M_SYSCTLTMP); return (error); } sx_xlock(&sysctlstringlock); memcpy(arg1, tmparg, arg2); ((char *)arg1)[arg2] = '\0'; sx_xunlock(&sysctlstringlock); free(tmparg, M_SYSCTLTMP); req->newidx += arg2; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Based on sysctl_handle_64() convert microseconds to a sbintime. */ int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t usec; usec = sbttous(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &usec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = ustosbt(usec); return (0); } /* * Based on sysctl_handle_64() convert milliseconds to a sbintime. */ int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t msec; msec = sbttoms(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &msec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = mstosbt(msec); return (0); } /* * Convert seconds to a struct timeval. Intended for use with * intervals and thus does not permit negative seconds. */ int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) { struct timeval *tv; int error, secs; tv = arg1; secs = tv->tv_sec; error = sysctl_handle_int(oidp, &secs, 0, req); if (error || req->newptr == NULL) return (error); if (secs < 0) return (EINVAL); tv->tv_sec = secs; return (0); } /* * Transfer functions to/from kernel space. 
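 * They are attached through req.oldfunc/req.newfunc by kernel_sysctl()
 * below; a sketch of a typical in-kernel caller (oid/oidlen assumed):
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	error = kernel_sysctl(curthread, oid, oidlen, &val, &len,
 *	    NULL, 0, NULL, 0);
 *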
* XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? 
len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; struct sysctl_oid key; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { key.oid_number = name[indx]; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? 
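 *
 * CTLFLAG_ANYBODY opts a node out of this check entirely; e.g. a
 * hypothetical
 *
 *	SYSCTL_INT(_debug, OID_AUTO, knob, CTLFLAG_RW | CTLFLAG_ANYBODY,
 *	    &knob, 0, "");
 *
 * would be writable without any of the privilege checks below.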
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct __sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } int kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel) { int oid[CTL_MAXNAME]; char namebuf[16]; char *name; size_t oidlen; int error; if (namelen > MAXPATHLEN || namelen == 0) return (EINVAL); name = namebuf; if (namelen > sizeof(namebuf)) name = malloc(namelen, M_SYSCTL, M_WAITOK); error = copyin(oname, name, namelen); if (error != 0) goto out; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, retval, flags); if (error != 0) goto out; error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, inkernel, new, newlen, retval, flags); out: if (namelen > sizeof(namebuf)) free(name, M_SYSCTL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctlbyname_args { const char *name; size_t namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) { size_t rv; int error; error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); if (error != 0) return (error); if (uap->oldlenp != NULL) error = copyout(&rv, uap->oldlenp, sizeof(rv)); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. 
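 *
 * A minimal sketch of such a caller (a hypothetical compat syscall
 * whose MIB has already been copied into the kernel):
 *
 *	int name[2] = { CTL_KERN, KERN_OSTYPE };
 *	size_t rv;
 *
 *	error = userland_sysctl(td, name, 2, uap->old, uap->oldlenp,
 *	    0, uap->new, uap->newlen, &rv, 0);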
*/ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif memlocked = 0; if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. 
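 * Callers typically use this to drain formatted output straight to the
 * request; a sketch of the common handler pattern:
 *
 *	struct sbuf sb;
 *
 *	sbuf_new_for_sysctl(&sb, NULL, 128, req);
 *	sbuf_printf(&sb, "%d", value);
 *	error = sbuf_finish(&sb);
 *	sbuf_delete(&sb);
 *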
*/ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } #ifdef DDB /* The current OID the debugger is working with */ static struct sysctl_oid *g_ddb_oid; /* The current flags specified by the user */ static int g_ddb_sysctl_flags; /* Check to see if the last sysctl printed */ static int g_ddb_sysctl_printed; static const int ctl_sign[CTLTYPE+1] = { [CTLTYPE_INT] = 1, [CTLTYPE_LONG] = 1, [CTLTYPE_S8] = 1, [CTLTYPE_S16] = 1, [CTLTYPE_S32] = 1, [CTLTYPE_S64] = 1, }; static const int ctl_size[CTLTYPE+1] = { [CTLTYPE_INT] = sizeof(int), [CTLTYPE_UINT] = sizeof(u_int), [CTLTYPE_LONG] = sizeof(long), [CTLTYPE_ULONG] = sizeof(u_long), [CTLTYPE_S8] = sizeof(int8_t), [CTLTYPE_S16] = sizeof(int16_t), [CTLTYPE_S32] = sizeof(int32_t), [CTLTYPE_S64] = sizeof(int64_t), [CTLTYPE_U8] = sizeof(uint8_t), [CTLTYPE_U16] = sizeof(uint16_t), [CTLTYPE_U32] = sizeof(uint32_t), [CTLTYPE_U64] = sizeof(uint64_t), }; #define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ #define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ #define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ #define DB_SYSCTL_HEX 0x008 /* Compare with -x */ #define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ static const char db_sysctl_modifs[] = { 'N', 'n', 'o', 'x', }; static const int db_sysctl_modif_values[] = { DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, }; /* Handlers considered safe to print while recursing */ static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { sysctl_handle_bool, sysctl_handle_8, sysctl_handle_16, sysctl_handle_32, sysctl_handle_64, sysctl_handle_int, sysctl_handle_long, sysctl_handle_string, sysctl_handle_opaque, }; /* * Use in place of sysctl_old_kernel to print sysctl values. 
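 * The request is pointed here by db_sysctl() below, roughly
 *
 *	req.oldfunc = sysctl_old_ddb;
 *	req.newfunc = sysctl_new_ddb;
 *
 * so each SYSCTL_OUT() from the handler is printed on the ddb console
 * rather than copied out to a buffer.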
* * Compare to the output handling in show_var from sbin/sysctl/sysctl.c */ static int sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) { const u_char *val, *p; const char *sep1; size_t intlen, slen; uintmax_t umv; intmax_t mv; int sign, ctltype, hexlen, xflag, error; /* Suppress false-positive GCC uninitialized variable warnings */ mv = 0; umv = 0; slen = len; val = p = ptr; if (ptr == NULL) { error = 0; goto out; } /* We are going to print */ g_ddb_sysctl_printed = 1; xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; ctltype = (g_ddb_oid->oid_kind & CTLTYPE); sign = ctl_sign[ctltype]; intlen = ctl_size[ctltype]; switch (ctltype) { case CTLTYPE_NODE: case CTLTYPE_STRING: db_printf("%.*s", (int) len, (const char *) p); error = 0; goto out; case CTLTYPE_INT: case CTLTYPE_UINT: case CTLTYPE_LONG: case CTLTYPE_ULONG: case CTLTYPE_S8: case CTLTYPE_S16: case CTLTYPE_S32: case CTLTYPE_S64: case CTLTYPE_U8: case CTLTYPE_U16: case CTLTYPE_U32: case CTLTYPE_U64: hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; sep1 = ""; while (len >= intlen) { switch (ctltype) { case CTLTYPE_INT: case CTLTYPE_UINT: umv = *(const u_int *)p; mv = *(const int *)p; break; case CTLTYPE_LONG: case CTLTYPE_ULONG: umv = *(const u_long *)p; mv = *(const long *)p; break; case CTLTYPE_S8: case CTLTYPE_U8: umv = *(const uint8_t *)p; mv = *(const int8_t *)p; break; case CTLTYPE_S16: case CTLTYPE_U16: umv = *(const uint16_t *)p; mv = *(const int16_t *)p; break; case CTLTYPE_S32: case CTLTYPE_U32: umv = *(const uint32_t *)p; mv = *(const int32_t *)p; break; case CTLTYPE_S64: case CTLTYPE_U64: umv = *(const uint64_t *)p; mv = *(const int64_t *)p; break; } db_printf("%s", sep1); if (xflag) db_printf("%#0*jx", hexlen, umv); else if (!sign) db_printf("%ju", umv); else if (g_ddb_oid->oid_fmt[1] == 'K') { /* Kelvins are currently unsupported. */ error = EOPNOTSUPP; goto out; } else db_printf("%jd", mv); sep1 = " "; len -= intlen; p += intlen; } error = 0; goto out; case CTLTYPE_OPAQUE: /* TODO: Support struct functions. */ /* FALLTHROUGH */ default: db_printf("Format:%s Length:%zu Dump:0x", g_ddb_oid->oid_fmt, len); while (len-- && (xflag || p < val + 16)) db_printf("%02x", *p++); if (!xflag && len > 16) db_printf("..."); error = 0; goto out; } out: req->oldidx += slen; return (error); } /* * Avoid setting new sysctl values from the debugger */ static int sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); /* Changing sysctls from the debugger is currently unsupported */ return (EPERM); } /* * Run a sysctl handler with the DDB oldfunc and newfunc attached. * Instead of copying any output to a buffer we'll dump it right to * the console. 
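 *
 * The usual call sequence (adapted from db_show_oid() below) probes
 * the size first and only then asks for the value to be printed:
 *
 *	len = 0;
 *	error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags);
 *	if (error == 0 && !g_ddb_sysctl_printed)
 *		error = db_sysctl(oidp, oid, nlen, (void *)1, &len,
 *		    NULL, flags);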
*/ static int db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, void *old, size_t *oldlenp, size_t *retval, int flags) { struct sysctl_req req; int error; /* Setup the request */ bzero(&req, sizeof req); req.td = kdb_thread; req.oldfunc = sysctl_old_ddb; req.newfunc = sysctl_new_ddb; req.lock = REQ_UNWIRED; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr = old; } /* Setup our globals for sysctl_old_ddb */ g_ddb_oid = oidp; g_ddb_sysctl_flags = flags; g_ddb_sysctl_printed = 0; error = sysctl_root(0, name, namelen, &req); /* Reset globals */ g_ddb_oid = NULL; g_ddb_sysctl_flags = 0; if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Show a sysctl's name */ static void db_show_oid_name(int *oid, size_t nlen) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2]; int error; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NAME; memcpy(qoid + 2, oid, nlen * sizeof(int)); error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); if (error) db_error("sysctl name oid"); error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); if (error) db_error("sysctl name"); } /* * Check to see if an OID is safe to print from ddb. */ static bool db_oid_safe(const struct sysctl_oid *oidp) { for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { if (oidp->oid_handler == db_safe_handlers[i]) return (true); } return (false); } /* * Show a sysctl at a specific OID * Compare to the input handling in show_var from sbin/sysctl/sysctl.c */ static int db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) { int error, xflag, oflag, Nflag, nflag; size_t len; xflag = flags & DB_SYSCTL_HEX; oflag = flags & DB_SYSCTL_OPAQUE; nflag = flags & DB_SYSCTL_VALUE_ONLY; Nflag = flags & DB_SYSCTL_NAME_ONLY; if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && (!xflag && !oflag)) return (0); if (Nflag) { db_show_oid_name(oid, nlen); error = 0; goto out; } if (!nflag) { db_show_oid_name(oid, nlen); db_printf(": "); } if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { db_printf("Skipping, unsafe to print while recursing."); error = 0; goto out; } /* Try once, and ask about the size */ len = 0; error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags); if (error) goto out; if (!g_ddb_sysctl_printed) /* Lie about the size */ error = db_sysctl(oidp, oid, nlen, (void *) 1, &len, NULL, flags); out: db_printf("\n"); return (error); } /* * Show all sysctls under a specific OID * Compare to sysctl_all from sbin/sysctl/sysctl.c */ static int db_show_sysctl_all(int *oid, size_t len, int flags) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2], next[CTL_MAXNAME]; size_t nlen; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NEXT; if (len) { nlen = len; memcpy(&qoid[2], oid, nlen * sizeof(int)); } else { nlen = 1; qoid[2] = CTL_KERN; } for (;;) { int error; size_t nextsize = sizeof(next); error = kernel_sysctl(kdb_thread, qoid, nlen + 2, next, &nextsize, NULL, 0, &nlen, 0); if (error != 0) { if (error == ENOENT) return (0); else db_error("sysctl(next)"); } nlen /= sizeof(int); if (nlen < (unsigned int)len) return (0); if (memcmp(&oid[0], &next[0], len * sizeof(int)) != 0) return (0); /* Find the OID in question */ error = sysctl_find_oid(next, nlen, &oidp, NULL, NULL); if (error) return (error); (void)db_show_oid(oidp, next, nlen, flags | DB_SYSCTL_SAFE_ONLY); if (db_pager_quit) return (0); memcpy(&qoid[2 + len], &next[len], (nlen - len) * sizeof(int)); } } /* * Show a sysctl by its 
user-facing string */ static int db_sysctlbyname(char *name, int flags) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; error = name2oid(name, oid, &nlen, &oidp); if (error) { return (error); } if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { db_show_sysctl_all(oid, nlen, flags); } else { error = db_show_oid(oidp, oid, nlen, flags); } return (error); } static void db_sysctl_cmd_usage(void) { db_printf( " sysctl [/Nnox] <sysctl> \n" " \n" " <sysctl> The name of the sysctl to show. \n" " \n" " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" " This will work for most sysctls, but should not be used \n" " with sysctls that are known to malloc. \n" " \n" " While recursing any \"unsafe\" sysctls will be skipped. \n" " Call sysctl directly on the sysctl to try printing the \n" " skipped sysctl. This is unsafe and may make the ddb \n" " session unusable. \n" " \n" " Arguments: \n" " /N Display only the name of the sysctl. \n" " /n Display only the value of the sysctl. \n" " /o Display opaque values. \n" " /x Display the sysctl in hex. \n" " \n" "For example: \n" "sysctl vm.v_free_min \n" "vm.v_free_min: 12669 \n" ); } /* * Show a specific sysctl similar to sysctl(8). */ DB_COMMAND_FLAGS(sysctl, db_sysctl_cmd, CS_OWN) { char name[TOK_STRING_SIZE]; int error, i, t, flags; /* Parse the modifiers */ t = db_read_token(); if (t == tSLASH || t == tMINUS) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); error = EINVAL; goto out; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } flags = 0; for (i = 0; i < nitems(db_sysctl_modifs); i++) { if (strchr(modif, db_sysctl_modifs[i])) { flags |= db_sysctl_modif_values[i]; } } /* Parse the sysctl names */ t = db_read_token(); if (t != tIDENT) { db_printf("Need sysctl name\n"); error = EINVAL; goto out; } /* Copy the name into a temporary buffer */ db_strcpy(name, db_tok_string); /* Ensure there is no trailing cruft */ t = db_read_token(); if (t != tEOL) { db_printf("Unexpected sysctl argument\n"); error = EINVAL; goto out; } error = db_sysctlbyname(name, flags); if (error == ENOENT) { db_printf("unknown oid: '%s'\n", db_tok_string); goto out; } else if (error) { db_printf("%s: error: %d\n", db_tok_string, error); goto out; } out: /* Ensure we eat all of our text */ db_flush_lex(); if (error == EINVAL) { db_sysctl_cmd_usage(); } } #endif /* DDB */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 568f1e1dbd95..eb7ce3828deb 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -1,2009 +1,2011 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_gdb.h" #include #include #include #include #include #ifdef SPARSE_MAPPING #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SPARSE_MAPPING #include #include #include #endif #include #include #include #include "linker_if.h" #define MAXSEGS 4 typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; /* Was file pre-loaded */ caddr_t address; /* Relocation address */ #ifdef SPARSE_MAPPING vm_object_t object; /* VM object to hold file pages */ #endif Elf_Dyn *dynamic; /* Symbol table etc. */ Elf_Hashelt nbuckets; /* DT_HASH info */ Elf_Hashelt nchains; const Elf_Hashelt *buckets; const Elf_Hashelt *chains; caddr_t hash; caddr_t strtab; /* DT_STRTAB */ int strsz; /* DT_STRSZ */ const Elf_Sym *symtab; /* DT_SYMTAB */ Elf_Addr *got; /* DT_PLTGOT */ const Elf_Rel *pltrel; /* DT_JMPREL */ int pltrelsize; /* DT_PLTRELSZ */ const Elf_Rela *pltrela; /* DT_JMPREL */ int pltrelasize; /* DT_PLTRELSZ */ const Elf_Rel *rel; /* DT_REL */ int relsize; /* DT_RELSZ */ const Elf_Rela *rela; /* DT_RELA */ int relasize; /* DT_RELASZ */ caddr_t modptr; const Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t symbase; /* malloc'ed symbold base */ caddr_t strbase; /* malloc'ed string base */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ #ifdef VIMAGE Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ Elf_Addr vnet_base; /* Relocated vnet set address. 
*/ #endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif } *elf_file_t; struct elf_set { Elf_Addr es_start; Elf_Addr es_stop; Elf_Addr es_base; TAILQ_ENTRY(elf_set) es_link; }; TAILQ_HEAD(elf_set_head, elf_set); #include static int link_elf_link_common_finish(linker_file_t); static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_lookup_debug_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_debug_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*); static int link_elf_search_symbol(linker_file_t, caddr_t, c_linker_sym_t *, long *); static void link_elf_unload_file(linker_file_t); static void link_elf_unload_preload(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static void link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); #ifdef VIMAGE static void link_elf_propagate_vnets(linker_file_t); #endif static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_lookup_debug_symbol, link_elf_lookup_debug_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_debug_symbol_values, link_elf_debug_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), #ifdef VIMAGE KOBJMETHOD(linker_propagate_vnets, link_elf_propagate_vnets), #endif KOBJMETHOD_END }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32", #else "elf64", #endif link_elf_methods, sizeof(struct elf_file) }; static bool link_elf_leak_locals = true; SYSCTL_BOOL(_debug, OID_AUTO, link_elf_leak_locals, CTLFLAG_RWTUN, &link_elf_leak_locals, 0, "Allow local symbols to participate in global module symbol resolution"); typedef int (*elf_reloc_fn)(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup); static int parse_dynamic(elf_file_t); static int relocate_file(elf_file_t); static int relocate_file1(elf_file_t ef, elf_lookup_fn lookup, elf_reloc_fn reloc, bool ifuncs); static int link_elf_preload_parse_symbols(elf_file_t); static struct elf_set_head set_pcpu_list; #ifdef VIMAGE static struct elf_set_head set_vnet_list; #endif static void elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr 
stop, Elf_Addr base) { struct elf_set *set, *iter; set = malloc(sizeof(*set), M_LINKER, M_WAITOK); set->es_start = start; set->es_stop = stop; set->es_base = base; TAILQ_FOREACH(iter, list, es_link) { KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) || (set->es_start > iter->es_start && set->es_stop > iter->es_stop), ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx", (uintmax_t)set->es_start, (uintmax_t)set->es_stop, (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop)); if (iter->es_start > set->es_start) { TAILQ_INSERT_BEFORE(iter, set, es_link); break; } } if (iter == NULL) TAILQ_INSERT_TAIL(list, set, es_link); } static int elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (addr < set->es_start) return (0); if (addr < set->es_stop) { *start = set->es_start; *base = set->es_base; return (1); } } return (0); } static void elf_set_delete(struct elf_set_head *list, Elf_Addr start) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (start < set->es_start) break; if (start == set->es_start) { TAILQ_REMOVE(list, set, es_link); free(set, M_LINKER); return; } } KASSERT(0, ("deleting unknown linker set (start = 0x%jx)", (uintmax_t)start)); } #ifdef GDB static void r_debug_state(struct r_debug *, struct link_map *); /* * A list of loaded modules for GDB to use for loading symbols. */ struct r_debug r_debug; #define GDB_STATE(s) do { \ r_debug.r_state = s; r_debug_state(NULL, NULL); \ } while (0) /* * Function for the debugger to set a breakpoint on to gain control. */ static void r_debug_state(struct r_debug *dummy_one __unused, struct link_map *dummy_two __unused) { } static void link_elf_add_gdb(struct link_map *l) { struct link_map *prev; l->l_next = NULL; if (r_debug.r_map == NULL) { /* Add first. */ l->l_prev = NULL; r_debug.r_map = l; } else { /* Append to list. */ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) ; l->l_prev = prev; prev->l_next = l; } } static void link_elf_delete_gdb(struct link_map *l) { if (l->l_prev == NULL) { /* Remove first. */ if ((r_debug.r_map = l->l_next) != NULL) l->l_next->l_prev = NULL; } else { /* Remove any but first. */ if ((l->l_prev->l_next = l->l_next) != NULL) l->l_next->l_prev = l->l_prev; } } #endif /* GDB */ /* * The kernel symbol table starts here. */ extern struct _dynamic _DYNAMIC; static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } /* * Actions performed after linking/loading both the preloaded kernel and any * modules, whether preloaded or dynamically loaded. */ static int link_elf_link_common_finish(linker_file_t lf) { #ifdef GDB elf_file_t ef = (elf_file_t)lf; char *newfilename; #endif int error; /* Notify MD code that a module is being loaded.
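 *
 * elf_cpu_load_file() is the MD hook for this (e.g. instruction cache
 * synchronization); on an architecture with nothing to do, a sketch of
 * it is simply:
 *
 *	int
 *	elf_cpu_load_file(linker_file_t lf __unused)
 *	{
 *
 *		return (0);
 *	}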
*/ error = elf_cpu_load_file(lf); if (error != 0) return (error); #ifdef GDB GDB_STATE(RT_ADD); ef->gdb.l_addr = lf->address; newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); strcpy(newfilename, lf->filename); ef->gdb.l_name = newfilename; ef->gdb.l_ld = ef->dynamic; link_elf_add_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } #ifdef RELOCATABLE_KERNEL /* * __startkernel and __endkernel are symbols set up as relocation canaries. * * They are defined in locore to reference linker script symbols at the * beginning and end of the LOAD area. This has the desired side effect of * giving us variables that have relative relocations pointing at them, so * relocation of the kernel object will cause the variables to be updated * automatically by the runtime linker when we initialize. * * There are two main reasons to relocate the kernel: * 1) If the loader needed to load the kernel at an alternate load address. * 2) If the kernel is switching address spaces on machines like POWER9 * under Radix where the high bits of the effective address are used to * differentiate between hypervisor, host, guest, and problem state. */ extern vm_offset_t __startkernel, __endkernel; #endif static unsigned long kern_relbase = KERNBASE; SYSCTL_ULONG(_kern, OID_AUTO, base_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, KERNBASE, "Kernel base address"); SYSCTL_ULONG(_kern, OID_AUTO, relbase_address, CTLFLAG_RD, &kern_relbase, 0, "Kernel relocated base address"); static void link_elf_init(void* arg) { Elf_Dyn *dp; Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr; elf_file_t ef; const char *modname; linker_add_class(&link_elf_class); dp = (Elf_Dyn *)&_DYNAMIC; modname = NULL; modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel"); if (modptr == NULL) modptr = preload_search_by_type("elf kernel"); modname = (char *)preload_search_info(modptr, MODINFO_NAME); if (modname == NULL) modname = "kernel"; linker_kernel_file = linker_make_file(modname, &link_elf_class); if (linker_kernel_file == NULL) panic("%s: Can't create linker structures for kernel", __func__); ef = (elf_file_t) linker_kernel_file; ef->preloaded = 1; #ifdef RELOCATABLE_KERNEL /* Compute relative displacement */ ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif #ifdef SPARSE_MAPPING ef->object = NULL; #endif ef->dynamic = dp; if (dp != NULL) parse_dynamic(ef); #ifdef RELOCATABLE_KERNEL linker_kernel_file->address = (caddr_t)__startkernel; linker_kernel_file->size = (intptr_t)(__endkernel - __startkernel); kern_relbase = (unsigned long)__startkernel; #else linker_kernel_file->address += KERNBASE; linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; #endif if (modptr != NULL) { ef->modptr = modptr; baseptr = preload_search_info(modptr, MODINFO_ADDR); if (baseptr != NULL) linker_kernel_file->address = *(caddr_t *)baseptr; sizeptr = preload_search_info(modptr, MODINFO_SIZE); if (sizeptr != NULL) linker_kernel_file->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { linker_kernel_file->ctors_addr = ef->address + *ctors_addrp; linker_kernel_file->ctors_size = *ctors_sizep; } } (void)link_elf_preload_parse_symbols(ef); #ifdef GDB r_debug.r_map = NULL; 
r_debug.r_brk = r_debug_state; r_debug.r_state = RT_CONSISTENT; #endif (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); + vnet_save_init((void *)VNET_START, VNET_STOP - VNET_START); #endif } SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, NULL); static int link_elf_preload_parse_symbols(elf_file_t ef) { caddr_t pointer; caddr_t ssym, esym, base; caddr_t strtab; int strcnt; Elf_Sym *symtab; int symcnt; if (ef->modptr == NULL) return (0); pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_SSYM); if (pointer == NULL) return (0); ssym = *(caddr_t *)pointer; pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_ESYM); if (pointer == NULL) return (0); esym = *(caddr_t *)pointer; base = ssym; symcnt = *(long *)base; base += sizeof(long); symtab = (Elf_Sym *)base; base += roundup(symcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } strcnt = *(long *)base; base += sizeof(long); strtab = base; base += roundup(strcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } ef->ddbsymtab = symtab; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbstrtab = strtab; ef->ddbstrcnt = strcnt; return (0); } static int parse_dynamic(elf_file_t ef) { Elf_Dyn *dp; int plttype = DT_REL; for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { switch (dp->d_tag) { case DT_HASH: { /* From src/libexec/rtld-elf/rtld.c */ const Elf_Hashelt *hashtab = (const Elf_Hashelt *) (ef->address + dp->d_un.d_ptr); ef->nbuckets = hashtab[0]; ef->nchains = hashtab[1]; ef->buckets = hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; break; } case DT_STRTAB: ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); break; case DT_STRSZ: ef->strsz = dp->d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); break; case DT_SYMENT: if (dp->d_un.d_val != sizeof(Elf_Sym)) return (ENOEXEC); break; case DT_PLTGOT: ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); break; case DT_REL: ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_RELSZ: ef->relsize = dp->d_un.d_val; break; case DT_RELENT: if (dp->d_un.d_val != sizeof(Elf_Rel)) return (ENOEXEC); break; case DT_JMPREL: ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_PLTRELSZ: ef->pltrelsize = dp->d_un.d_val; break; case DT_RELA: ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); break; case DT_RELASZ: ef->relasize = dp->d_un.d_val; break; case DT_RELAENT: if (dp->d_un.d_val != sizeof(Elf_Rela)) return (ENOEXEC); break; case DT_PLTREL: plttype = dp->d_un.d_val; if (plttype != DT_REL && plttype != DT_RELA) return (ENOEXEC); break; #ifdef GDB case DT_DEBUG: dp->d_un.d_ptr = (Elf_Addr)&r_debug; break; #endif } } if (plttype == DT_RELA) { ef->pltrela = (const Elf_Rela *)ef->pltrel; ef->pltrel = NULL; ef->pltrelasize = ef->pltrelsize; ef->pltrelsize = 0; } ef->ddbsymtab = ef->symtab; ef->ddbsymcnt = ef->nchains; ef->ddbstrtab = ef->strtab; ef->ddbstrcnt = ef->strsz; return elf_cpu_parse_dynamic(ef->address, ef->dynamic); } #define LS_PADDING 0x90909090 static int parse_dpcpu(elf_file_t ef) { int error, size; #if defined(__i386__) uint32_t pad; #endif ef->pcpu_start = 0; ef->pcpu_stop = 0; error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start, (void ***)&ef->pcpu_stop, NULL); /* Error just means there is 
no pcpu set to relocate. */ if (error != 0) return (0); size = (uintptr_t)ef->pcpu_stop - (uintptr_t)ef->pcpu_start; /* Empty set? */ if (size < 1) return (0); #if defined(__i386__) /* In case we do find __start/stop_set_ symbols double-check. */ if (size < 4) { uprintf("Kernel module '%s' must be recompiled with " "linker script\n", ef->lf.pathname); return (ENOEXEC); } /* Padding from linker-script correct? */ pad = *(uint32_t *)((uintptr_t)ef->pcpu_stop - sizeof(pad)); if (pad != LS_PADDING) { uprintf("Kernel module '%s' must be recompiled with " "linker script, invalid padding %#04x (%#04x)\n", ef->lf.pathname, pad, LS_PADDING); return (ENOEXEC); } /* If we only have valid padding, nothing to do. */ if (size == 4) return (0); #endif /* * Allocate space in the primary pcpu area. Copy in our * initialization from the data section and then initialize * all per-cpu storage from that. */ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(size); if (ef->pcpu_base == 0) { printf("%s: pcpu module space is out of space; " "cannot allocate %d for %s\n", __func__, size, ef->lf.pathname); return (ENOSPC); } memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, size); dpcpu_copy((void *)ef->pcpu_base, size); elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop, ef->pcpu_base); return (0); } #ifdef VIMAGE static int parse_vnet(elf_file_t ef) { int error, size; #if defined(__i386__) uint32_t pad; #endif ef->vnet_start = 0; ef->vnet_stop = 0; ef->vnet_base = 0; error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, (void ***)&ef->vnet_stop, NULL); /* Error just means there is no vnet data set to relocate. */ if (error != 0) return (0); size = (uintptr_t)ef->vnet_stop - (uintptr_t)ef->vnet_start; /* Empty set? */ if (size < 1) return (0); #if defined(__i386__) /* In case we do find __start/stop_set_ symbols double-check. */ if (size < 4) { uprintf("Kernel module '%s' must be recompiled with " "linker script\n", ef->lf.pathname); return (ENOEXEC); } /* Padding from linker-script correct? */ pad = *(uint32_t *)((uintptr_t)ef->vnet_stop - sizeof(pad)); if (pad != LS_PADDING) { uprintf("Kernel module '%s' must be recompiled with " "linker script, invalid padding %#04x (%#04x)\n", ef->lf.pathname, pad, LS_PADDING); return (ENOEXEC); } /* If we only have valid padding, nothing to do. */ if (size == 4) return (0); #endif /* * Allocate space in the primary vnet area. Copy in our * initialization from the data section and then initialize * all per-vnet storage from that. */ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(size); if (ef->vnet_base == 0) { printf("%s: vnet module space is out of space; " "cannot allocate %d for %s\n", __func__, size, ef->lf.pathname); return (ENOSPC); } memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, size); + vnet_save_init((void *)ef->vnet_base, size); elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop, ef->vnet_base); return (0); } #endif #undef LS_PADDING /* * Apply the specified protection to the loadable segments of a preloaded linker * file. 
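 *
 * The linker calls this twice: link_elf_link_preload() uses
 *
 *	preload_protect(ef, VM_PROT_ALL);
 *
 * so relocation can write everywhere, and link_elf_link_preload_finish()
 * then passes VM_PROT_NONE to drop back to each segment's PF_W/PF_X
 * permissions.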
*/ static int preload_protect(elf_file_t ef, vm_prot_t prot) { #if defined(__aarch64__) || defined(__amd64__) Elf_Ehdr *hdr; Elf_Phdr *phdr, *phlimit; vm_prot_t nprot; int error; error = 0; hdr = (Elf_Ehdr *)ef->address; phdr = (Elf_Phdr *)(ef->address + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; for (; phdr < phlimit; phdr++) { if (phdr->p_type != PT_LOAD) continue; nprot = prot | VM_PROT_READ; if ((phdr->p_flags & PF_W) != 0) nprot |= VM_PROT_WRITE; if ((phdr->p_flags & PF_X) != 0) nprot |= VM_PROT_EXECUTE; error = pmap_change_prot((vm_offset_t)ef->address + phdr->p_vaddr, round_page(phdr->p_memsz), nprot); if (error != 0) break; } return (error); #else return (0); #endif } #ifdef __arm__ /* * Locate the ARM exception/unwind table info for DDB and stack(9) use by * searching for the section header that describes it. There may be no unwind * info, for example in a module containing only data. */ static void link_elf_locate_exidx(linker_file_t lf, Elf_Shdr *shdr, int nhdr) { int i; for (i = 0; i < nhdr; i++) { if (shdr[i].sh_type == SHT_ARM_EXIDX) { lf->exidx_addr = shdr[i].sh_addr + lf->address; lf->exidx_size = shdr[i].sh_size; break; } } } /* * Locate the section headers metadata in a preloaded module, then use it to * locate the exception/unwind table in the module. The size of the metadata * block is stored in a uint32 word immediately before the data itself, and a * comment in preload_search_info() says it is safe to rely on that. */ static void link_elf_locate_exidx_preload(struct linker_file *lf, caddr_t modptr) { uint32_t *modinfo; Elf_Shdr *shdr; uint32_t nhdr; modinfo = (uint32_t *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (modinfo != NULL) { shdr = (Elf_Shdr *)modinfo; nhdr = modinfo[-1] / sizeof(Elf_Shdr); link_elf_locate_exidx(lf, shdr, nhdr); } } #endif /* __arm__ */ static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr, dynptr; char *type; elf_file_t ef; linker_file_t lf; int error; vm_offset_t dp; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return (ENOENT); type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); dynptr = preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_DYNAMIC); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 && strcmp(type, "elf module") != 0)) return (EFTYPE); if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t) lf; ef->preloaded = 1; ef->modptr = modptr; ef->address = *(caddr_t *)baseptr; #ifdef SPARSE_MAPPING ef->object = NULL; #endif dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; ef->dynamic = (Elf_Dyn *)dp; lf->address = ef->address; lf->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { lf->ctors_addr = ef->address + *ctors_addrp; lf->ctors_size = *ctors_sizep; } #ifdef __arm__ link_elf_locate_exidx_preload(lf, modptr); #endif error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); #ifdef VIMAGE if (error == 0) error = 
parse_vnet(ef); #endif if (error == 0) error = preload_protect(ef, VM_PROT_ALL); if (error != 0) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } link_elf_reloc_local(lf); *result = lf; return (0); } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t) lf; error = relocate_file(ef); if (error == 0) error = preload_protect(ef, VM_PROT_NONE); if (error != 0) return (error); (void)link_elf_preload_parse_symbols(ef); return (link_elf_link_common_finish(lf)); } static int link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) { struct nameidata nd; struct thread* td = curthread; /* XXX */ Elf_Ehdr *hdr; caddr_t firstpage, segbase; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[MAXSEGS]; int nsegs; Elf_Phdr *phdyn; caddr_t mapbase; size_t mapsize; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; Elf_Shdr *shdr; int symtabindex; int symstrindex; int shstrindex; int symcnt; int strcnt; char *shstrs; shdr = NULL; lf = NULL; shstrs = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE_PNBUF(&nd); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; firstpage = NULL; goto out; } #ifdef MAC error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp); if (error != 0) { firstpage = NULL; goto out; } #endif /* * Read the elf header from the file. */ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); nbytes = PAGE_SIZE - resid; if (error != 0) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. * This is not strictly required by the ABI specification, but * it seems to always be true in practice. And, it simplifies * things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error(filename, "Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == MAXSEGS) { link_elf_error(filename, "Too many sections"); error = ENOEXEC; goto out; } /* * XXX: We just trust they come in the right order ??
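 * In fact the ELF gABI requires PT_LOAD entries to appear in ascending
 * p_vaddr order, which is what the base_vaddr/base_vlimit computation
 * below depends on when it picks segs[0] and segs[nsegs - 1] as the
 * lowest and highest segments.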
*/ segs[nsegs] = phdr; ++nsegs; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error(filename, "Object is not dynamically-linked"); error = ENOEXEC; goto out; } if (nsegs == 0) { link_elf_error(filename, "No sections"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake * out our contiguous region, and to establish the base * address for relocation. */ base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + segs[nsegs - 1]->p_memsz); mapsize = base_vlimit - base_vaddr; lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; #ifdef SPARSE_MAPPING ef->object = vm_pager_allocate(OBJT_PHYS, NULL, mapsize, VM_PROT_ALL, 0, thread0.td_ucred); if (ef->object == NULL) { error = ENOMEM; goto out; } #ifdef __amd64__ mapbase = (caddr_t)KERNBASE; #else mapbase = (caddr_t)vm_map_min(kernel_map); #endif /* * Mapping protections are downgraded after relocation processing. */ error = vm_map_find(kernel_map, ef->object, 0, (vm_offset_t *)&mapbase, mapsize, 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != 0) { vm_object_deallocate(ef->object); ef->object = NULL; goto out; } #else mapbase = malloc_exec(mapsize, M_LINKER, M_WAITOK); #endif ef->address = mapbase; /* * Read the text and data sections and zero the bss. */ for (i = 0; i < nsegs; i++) { segbase = mapbase + segs[i]->p_vaddr - base_vaddr; #ifdef SPARSE_MAPPING /* * Consecutive segments may have different mapping permissions, * so be strict and verify that their mappings do not overlap. */ if (((vm_offset_t)segbase & PAGE_MASK) != 0) { error = EINVAL; goto out; } error = vm_map_wire(kernel_map, (vm_offset_t)segbase, (vm_offset_t)segbase + round_page(segs[i]->p_memsz), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } #endif error = vn_rdwr(UIO_READ, nd.ni_vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); } ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(ef); if (error != 0) goto out; error = parse_dpcpu(ef); if (error != 0) goto out; #ifdef VIMAGE error = parse_vnet(ef); if (error != 0) goto out; #endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; error = relocate_file(ef); if (error != 0) goto out; #ifdef SPARSE_MAPPING /* * Downgrade permissions on text segment mappings now that relocation * processing is complete. Restrict permissions on read-only segments. */ for (i = 0; i < nsegs; i++) { vm_prot_t prot; if (segs[i]->p_type != PT_LOAD) continue; prot = VM_PROT_READ; if ((segs[i]->p_flags & PF_W) != 0) prot |= VM_PROT_WRITE; if ((segs[i]->p_flags & PF_X) != 0) prot |= VM_PROT_EXECUTE; segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vm_map_protect(kernel_map, (vm_offset_t)segbase, (vm_offset_t)segbase + round_page(segs[i]->p_memsz), prot, 0, VM_MAP_PROTECT_SET_PROT); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } } #endif /* * Try and load the symbol table if it's present. (you can * strip it!) 
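 * A SHT_SYMTAB section's sh_link field names its companion string
 * table; both are read into ef->symbase and ef->strbase below for use
 * by DDB and the symbol lookup routines.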
*/ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; /* Read section string table */ shstrindex = hdr->e_shstrndx; if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB && shdr[shstrindex].sh_size != 0) { nbytes = shdr[shstrindex].sh_size; shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; } symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } else if (shstrs != NULL && shdr[i].sh_name != 0 && strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) { /* Record relocated address and size of .ctors. */ lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr; lf->ctors_size = shdr[i].sh_size; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; error = vn_rdwr(UIO_READ, nd.ni_vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: #ifdef __arm__ link_elf_locate_exidx(lf, shdr, hdr->e_shnum); #endif error = link_elf_link_common_finish(lf); if (error != 0) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error != 0 && lf != NULL) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(shdr, M_LINKER); free(firstpage, M_LINKER); free(shstrs, M_LINKER); return (error); } Elf_Addr elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; KASSERT(lf->ops->cls == (kobj_class_t)&link_elf_class, ("elf_relocaddr: unexpected linker file %p", lf)); ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); #ifdef VIMAGE if (x >= ef->vnet_start && x < ef->vnet_stop) return ((x - ef->vnet_start) + ef->vnet_base); #endif return (x); } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; if (ef->pcpu_base != 0) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); elf_set_delete(&set_pcpu_list, ef->pcpu_start); } #ifdef VIMAGE if (ef->vnet_base != 0) { vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); elf_set_delete(&set_vnet_list, ef->vnet_start); } #endif #ifdef GDB if (ef->gdb.l_ld != NULL) { GDB_STATE(RT_DELETE); free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); link_elf_delete_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); } #endif /* Notify MD code that a module is being unloaded. 
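 * elf_cpu_unload_file() is the MD counterpart of the elf_cpu_load_file()
 * call made when the file was linked; it runs before any of the file's
 * mappings or tables are torn down below.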
*/ elf_cpu_unload_file(file); if (ef->preloaded) { link_elf_unload_preload(file); return; } #ifdef SPARSE_MAPPING if (ef->object != NULL) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } #else free(ef->address, M_LINKER); #endif free(ef->symbase, M_LINKER); free(ef->strbase, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static void link_elf_unload_preload(linker_file_t file) { if (file->pathname != NULL) preload_delete_name(file->pathname); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ef->strtab + ref->st_name); } return (NULL); } static int symbol_type(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ELF_ST_TYPE(ref->st_info)); } return (STT_NOTYPE); } static int relocate_file1(elf_file_t ef, elf_lookup_fn lookup, elf_reloc_fn reloc, bool ifuncs) { const Elf_Rel *rel; const Elf_Rela *rela; const char *symname; TSENTER(); #define APPLY_RELOCS(iter, tbl, tblsize, type) do { \ for ((iter) = (tbl); (iter) != NULL && \ (iter) < (tbl) + (tblsize) / sizeof(*(iter)); (iter)++) { \ if ((symbol_type(ef, (iter)->r_info) == \ STT_GNU_IFUNC || \ elf_is_ifunc_reloc((iter)->r_info)) != ifuncs) \ continue; \ if (reloc(&ef->lf, (Elf_Addr)ef->address, \ (iter), (type), lookup)) { \ symname = symbol_name(ef, (iter)->r_info); \ printf("link_elf: symbol %s undefined\n", \ symname); \ return (ENOENT); \ } \ } \ } while (0) APPLY_RELOCS(rel, ef->rel, ef->relsize, ELF_RELOC_REL); TSENTER2("ef->rela"); APPLY_RELOCS(rela, ef->rela, ef->relasize, ELF_RELOC_RELA); TSEXIT2("ef->rela"); APPLY_RELOCS(rel, ef->pltrel, ef->pltrelsize, ELF_RELOC_REL); APPLY_RELOCS(rela, ef->pltrela, ef->pltrelasize, ELF_RELOC_RELA); #undef APPLY_RELOCS TSEXIT(); return (0); } static int relocate_file(elf_file_t ef) { int error; error = relocate_file1(ef, elf_lookup, elf_reloc, false); if (error == 0) error = relocate_file1(ef, elf_lookup, elf_reloc, true); return (error); } /* * SysV hash function for symbol table lookup. It is specified by the * System V ABI. */ static Elf32_Word elf_hash(const char *name) { const unsigned char *p = (const unsigned char *)name; Elf32_Word h = 0; while (*p != '\0') { h = (h << 4) + *p++; h ^= (h >> 24) & 0xf0; } return (h & 0x0fffffff); } static int link_elf_lookup_symbol1(linker_file_t lf, const char *name, c_linker_sym_t *sym, bool see_local) { elf_file_t ef = (elf_file_t) lf; unsigned long symnum; const Elf_Sym* symp; const char *strp; Elf32_Word hash; /* If we don't have a hash, bail. 
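 * The DT_HASH table built by the static linker is two arrays: for a
 * name hashing to h, buckets[h % nbuckets] gives the first candidate
 * symbol index and chains[symnum] each subsequent one, with STN_UNDEF
 * terminating the chain; the loop below walks exactly that structure.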
*/ if (ef->buckets == NULL || ef->nbuckets == 0) { printf("link_elf_lookup_symbol: missing symbol hash table\n"); return (ENOENT); } /* First, search hashed global symbols */ hash = elf_hash(name); symnum = ef->buckets[hash % ef->nbuckets]; while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } symp = ef->symtab + symnum; if (symp->st_name == 0) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } strp = ef->strtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC))) { if (see_local || ELF_ST_BIND(symp->st_info) != STB_LOCAL) { *sym = (c_linker_sym_t) symp; return (0); } } return (ENOENT); } symnum = ef->chains[symnum]; } return (ENOENT); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { if (link_elf_leak_locals) return (link_elf_lookup_debug_symbol(lf, name, sym)); return (link_elf_lookup_symbol1(lf, name, sym, false)); } static int link_elf_lookup_debug_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym* symp; const char *strp; int i; if (link_elf_lookup_symbol1(lf, name, sym, true) == 0) return (0); for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC))) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_symbol_values1(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval, bool see_local) { elf_file_t ef; const Elf_Sym *es; caddr_t val; ef = (elf_file_t)lf; es = (const Elf_Sym *)sym; if (es >= ef->symtab && es < ef->symtab + ef->nchains) { if (!see_local && ELF_ST_BIND(es->st_info) == STB_LOCAL) return (ENOENT); symval->name = ef->strtab + es->st_name; val = (caddr_t)ef->address + es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { if (link_elf_leak_locals) return (link_elf_debug_symbol_values(lf, sym, symval)); return (link_elf_symbol_values1(lf, sym, symval, false)); } static int link_elf_debug_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *es = (const Elf_Sym *)sym; caddr_t val; if (link_elf_symbol_values1(lf, sym, symval, true) == 0) return (0); if (ef->symtab == ef->ddbsymtab) return (ENOENT); if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; val = (caddr_t)ef->address + es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t)lf; u_long off = (uintptr_t)(void *)value; u_long diff = off; u_long st_value; const Elf_Sym *es; const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value 
= es->st_value + (uintptr_t) (void *) ef->address; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { c_linker_sym_t sym; linker_symval_t symval; char *setsym; void **start, **stop; int len, error = 0, count; len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ setsym = malloc(len, M_LINKER, M_WAITOK); /* get address of first entry */ snprintf(setsym, len, "%s%s", "__start_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } start = (void **)symval.value; /* get address of last entry */ snprintf(setsym, len, "%s%s", "__stop_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } stop = (void **)symval.value; /* and the number of entries */ count = stop - start; /* and copy out */ if (startp != NULL) *startp = start; if (stopp != NULL) *stopp = stop; if (countp != NULL) *countp = count; out: free(setsym, M_LINKER); return (error); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error != 0) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = link_elf_debug_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error == 0) error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } const Elf_Sym * elf_get_sym(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; if (symidx >= ef->nchains) return (NULL); return (ef->symtab + symidx); } const char * elf_get_symname(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; if (symidx >= ef->nchains) return (NULL); sym = ef->symtab + symidx; return (ef->strtab + sym->st_name); } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. 
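 *
 * STB_LOCAL symbols in particular may be absent from the hash chains,
 * so the symbol index taken from a relocation's r_info field is the
 * only reliable way to reach them; the code below resolves locals
 * directly from st_value and only falls back to
 * linker_file_lookup_symbol() for global and weak symbols.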
*/ static int elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; const char *symbol; Elf_Addr addr, start, base; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->nchains) { *res = 0; return (EINVAL); } sym = ef->symtab + symidx; /* * Don't do a full lookup when the symbol is local. It may even * fail because it may not be found through the hash table. */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { /* Force lookup failure when we have an insanity. */ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) { *res = 0; return (EINVAL); } *res = ((Elf_Addr)ef->address + sym->st_value); return (0); } /* * XXX we can avoid doing a hash table based lookup for global * symbols as well. This however is not always valid, so we'll * just do it the hard way for now. Performance tweaks can * always be added. */ symbol = ef->strtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) { *res = 0; return (EINVAL); } if (elf_set_find(&set_pcpu_list, addr, &start, &base)) addr = addr - start + base; #ifdef VIMAGE else if (elf_set_find(&set_vnet_list, addr, &start, &base)) addr = addr - start + base; #endif *res = addr; return (0); } static void link_elf_reloc_local(linker_file_t lf) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; elf_file_t ef = (elf_file_t)lf; /* Perform relocations without addend if there are any: */ if ((rel = ef->rel) != NULL) { rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); while (rel < rellim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup); rel++; } } /* Perform relocations with addend if there are any: */ if ((rela = ef->rela) != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup); rela++; } } } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } #ifdef VIMAGE static void link_elf_propagate_vnets(linker_file_t lf) { elf_file_t ef = (elf_file_t)lf; int size; if (ef->vnet_base != 0) { size = (uintptr_t)ef->vnet_stop - (uintptr_t)ef->vnet_start; vnet_data_copy((void *)ef->vnet_base, size); } } #endif #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) || defined(__powerpc__) /* * Use this lookup routine when performing relocations early during boot. * The generic lookup routine depends on kobj, which is not initialized * at that point. 
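 *
 * Only STT_GNU_IFUNC symbols need to be resolved this early: each such
 * symbol's st_value names a resolver function, and the relocation
 * target is whatever address that resolver returns.  Such symbols are
 * what DEFINE_IFUNC() emits; a made-up sketch (all names invented for
 * illustration):
 *
 *	static int do_thing_fast(void), do_thing_slow(void);
 *
 *	DEFINE_IFUNC(static, int, do_thing, (void))
 *	{
 *		return (cpu_has_fast_path ? do_thing_fast : do_thing_slow);
 *	}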
*/ static int elf_lookup_ifunc(linker_file_t lf, Elf_Size symidx, int deps __unused, Elf_Addr *res) { elf_file_t ef; const Elf_Sym *symp; caddr_t val; ef = (elf_file_t)lf; symp = ef->symtab + symidx; if (ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC) { val = (caddr_t)ef->address + symp->st_value; *res = ((Elf_Addr (*)(void))val)(); return (0); } return (ENOENT); } void link_elf_ireloc(caddr_t kmdp) { struct elf_file eff; elf_file_t ef; TSENTER(); ef = &eff; bzero_early(ef, sizeof(*ef)); ef->modptr = kmdp; ef->dynamic = (Elf_Dyn *)&_DYNAMIC; #ifdef RELOCATABLE_KERNEL ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif parse_dynamic(ef); link_elf_preload_parse_symbols(ef); relocate_file1(ef, elf_lookup_ifunc, elf_reloc, true); TSEXIT(); } #if defined(__aarch64__) || defined(__amd64__) void link_elf_late_ireloc(void) { elf_file_t ef; KASSERT(linker_kernel_file != NULL, ("link_elf_late_ireloc: No kernel linker file found")); ef = (elf_file_t)linker_kernel_file; relocate_file1(ef, elf_lookup_ifunc, elf_reloc_late, true); } #endif #endif diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c index d4ad963e8181..0b2befc02c1a 100644 --- a/sys/kern/link_elf_obj.c +++ b/sys/kern/link_elf_obj.c @@ -1,1881 +1,1889 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" typedef struct { void *addr; Elf_Off size; int flags; /* Section flags. */ int sec; /* Original section number. 
*/ char *name; } Elf_progent; typedef struct { Elf_Rel *rel; int nrel; int sec; } Elf_relent; typedef struct { Elf_Rela *rela; int nrela; int sec; } Elf_relaent; typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; caddr_t address; /* Relocation address */ vm_object_t object; /* VM object to hold file pages */ Elf_Shdr *e_shdr; Elf_progent *progtab; u_int nprogtab; Elf_relaent *relatab; u_int nrelatab; Elf_relent *reltab; int nreltab; Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t shstrtab; /* Section name string table */ long shstrcnt; /* number of bytes in string table */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ } *elf_file_t; #include static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_lookup_debug_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_debug_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t value, c_linker_sym_t *sym, long *diffp); static void link_elf_unload_file(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static int link_elf_reloc_local(linker_file_t, bool); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); #ifdef VIMAGE static void link_elf_propagate_vnets(linker_file_t); #endif static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_lookup_debug_symbol, link_elf_lookup_debug_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_debug_symbol_values, link_elf_debug_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), #ifdef VIMAGE KOBJMETHOD(linker_propagate_vnets, link_elf_propagate_vnets), #endif KOBJMETHOD_END }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32_obj", #else "elf64_obj", #endif link_elf_methods, sizeof(struct elf_file) }; static bool link_elf_obj_leak_locals = 
true; SYSCTL_BOOL(_debug, OID_AUTO, link_elf_obj_leak_locals, CTLFLAG_RWTUN, &link_elf_obj_leak_locals, 0, "Allow local symbols to participate in global module symbol resolution"); static int relocate_file(elf_file_t ef); static void elf_obj_cleanup_globals_cache(elf_file_t); static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_init(void *arg) { linker_add_class(&link_elf_class); } SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, NULL); static void link_elf_protect_range(elf_file_t ef, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { int error __unused; KASSERT(start <= end && start >= (vm_offset_t)ef->address && end <= round_page((vm_offset_t)ef->address + ef->lf.size), ("link_elf_protect_range: invalid range %#jx-%#jx", (uintmax_t)start, (uintmax_t)end)); if (start == end) return; if (ef->preloaded) { #ifdef __amd64__ error = pmap_change_prot(start, end - start, prot); KASSERT(error == 0, ("link_elf_protect_range: pmap_change_prot() returned %d", error)); #endif return; } error = vm_map_protect(kernel_map, start, end, prot, 0, VM_MAP_PROTECT_SET_PROT); KASSERT(error == KERN_SUCCESS, ("link_elf_protect_range: vm_map_protect() returned %d", error)); } /* * Restrict permissions on linker file memory based on section flags. * Sections need not be page-aligned, so overlap within a page is possible. */ static void link_elf_protect(elf_file_t ef) { vm_offset_t end, segend, segstart, start; vm_prot_t gapprot, prot, segprot; int i; /* * If the file was preloaded, the last page may contain other preloaded * data which may need to be writeable. ELF files are always * page-aligned, but other preloaded data, such as entropy or CPU * microcode may be loaded with a smaller alignment. */ gapprot = ef->preloaded ? VM_PROT_RW : VM_PROT_READ; start = end = (vm_offset_t)ef->address; prot = VM_PROT_READ; for (i = 0; i < ef->nprogtab; i++) { /* * VNET and DPCPU sections have their memory allocated by their * respective subsystems. */ if (ef->progtab[i].name != NULL && ( #ifdef VIMAGE strcmp(ef->progtab[i].name, VNET_SETNAME) == 0 || #endif strcmp(ef->progtab[i].name, DPCPU_SETNAME) == 0)) continue; segstart = trunc_page((vm_offset_t)ef->progtab[i].addr); segend = round_page((vm_offset_t)ef->progtab[i].addr + ef->progtab[i].size); segprot = VM_PROT_READ; if ((ef->progtab[i].flags & SHF_WRITE) != 0) segprot |= VM_PROT_WRITE; if ((ef->progtab[i].flags & SHF_EXECINSTR) != 0) segprot |= VM_PROT_EXECUTE; if (end <= segstart) { /* * Case 1: there is no overlap between the previous * segment and this one. Apply protections to the * previous segment, and protect the gap between the * previous and current segments, if any. */ link_elf_protect_range(ef, start, end, prot); link_elf_protect_range(ef, end, segstart, gapprot); start = segstart; end = segend; prot = segprot; } else if (start < segstart && end == segend) { /* * Case 2: the current segment is a subrange of the * previous segment. Apply protections to the * non-overlapping portion of the previous segment. */ link_elf_protect_range(ef, start, segstart, prot); start = segstart; prot |= segprot; } else if (end < segend) { /* * Case 3: there is partial overlap between the previous * and current segments. Apply protections to the * non-overlapping portion of the previous segment, and * then the overlap, which must use the union of the two * segments' protections. 
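 *
 * For example, with 4KB pages, a read-only section ending at 0x1800
 * followed by an executable section starting at 0x1800 share the page
 * at 0x1000: pages below 0x1000 stay read-only, the shared page at
 * 0x1000 gets VM_PROT_READ | VM_PROT_EXECUTE, and the remainder of the
 * executable section keeps its own protections.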
*/ link_elf_protect_range(ef, start, segstart, prot); link_elf_protect_range(ef, segstart, end, prot | segprot); start = end; end = segend; prot = segprot; } else { /* * Case 4: the two segments reside in the same page. */ prot |= segprot; } } /* * Fix up the last unprotected segment and trailing data. */ link_elf_protect_range(ef, start, end, prot); link_elf_protect_range(ef, end, round_page((vm_offset_t)ef->address + ef->lf.size), gapprot); } static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; void *modptr, *baseptr, *sizeptr; char *type; elf_file_t ef; linker_file_t lf; Elf_Addr off; int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return ENOENT; type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_ELFHDR); shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " obj module") != 0 && strcmp(type, "elf obj module") != 0)) { return (EFTYPE); } if (baseptr == NULL || sizeptr == NULL || hdr == NULL || shdr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t)lf; ef->preloaded = 1; ef->address = *(caddr_t *)baseptr; lf->address = *(caddr_t *)baseptr; lf->size = *(size_t *)sizeptr; if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT || hdr->e_type != ET_REL || hdr->e_machine != ELF_TARG_MACH) { error = EFTYPE; goto out; } ef->e_shdr = shdr; /* Scan the section header for information and table sizing. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif case SHT_INIT_ARRAY: case SHT_FINI_ARRAY: /* Ignore sections not loaded by the loader. */ if (shdr[i].sh_addr == 0) break; ef->nprogtab++; break; case SHT_SYMTAB: symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: /* * Ignore relocation tables for sections not * loaded by the loader. 
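 * A section the loader skipped has sh_addr == 0; debug info and other
 * non-allocated sections fall into this bucket, and relocations that
 * target them have nothing to patch.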
*/ if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->nreltab++; break; case SHT_RELA: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->nrelatab++; break; } } shstrindex = hdr->e_shstrndx; if (ef->nprogtab == 0 || symstrindex < 0 || symstrindex >= hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 || shstrindex >= hdr->e_shnum || shdr[shstrindex].sh_type != SHT_STRTAB) { printf("%s: bad/missing section headers\n", filename); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } /* XXX, relocate the sh_addr fields saved by the loader. */ off = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off)) off = shdr[i].sh_addr; } for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0) shdr[i].sh_addr = shdr[i].sh_addr - off + (Elf_Addr)ef->address; } ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr; ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = (char *)shdr[shstrindex].sh_addr; /* Now fill out progtab and the relocation tables. */ pb = 0; rl = 0; ra = 0; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif case SHT_INIT_ARRAY: case SHT_FINI_ARRAY: if (shdr[i].sh_addr == 0) break; ef->progtab[pb].addr = (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<<UNWIND>>"; #endif else if (shdr[i].sh_type == SHT_INIT_ARRAY) ef->progtab[pb].name = "<<INIT_ARRAY>>"; else if (shdr[i].sh_type == SHT_FINI_ARRAY) ef->progtab[pb].name = "<<FINI_ARRAY>>"; else ef->progtab[pb].name = "<<NOBITS>>"; ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].flags = shdr[i].sh_flags; ef->progtab[pb].sec = i; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { void *dpcpu; dpcpu = dpcpu_alloc(shdr[i].sh_size); if (dpcpu == NULL) { printf("%s: pcpu module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); error = ENOSPC; goto out; } memcpy(dpcpu, ef->progtab[pb].addr, ef->progtab[pb].size); dpcpu_copy(dpcpu, shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; #ifdef VIMAGE } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { void *vnet_data; vnet_data = vnet_data_alloc(shdr[i].sh_size); if (vnet_data == NULL) { printf("%s: vnet module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); error = ENOSPC; goto out; } memcpy(vnet_data, ef->progtab[pb].addr, ef->progtab[pb].size); ef->progtab[pb].addr = vnet_data; + vnet_save_init(ef->progtab[pb].addr, + ef->progtab[pb].size); #endif } else if ((ef->progtab[pb].name
!= NULL && strcmp(ef->progtab[pb].name, ".ctors") == 0) || shdr[i].sh_type == SHT_INIT_ARRAY) { if (lf->ctors_addr != 0) { printf( "%s: multiple ctor sections in %s\n", __func__, filename); } else { lf->ctors_addr = ef->progtab[pb].addr; lf->ctors_size = shdr[i].sh_size; } } else if ((ef->progtab[pb].name != NULL && strcmp(ef->progtab[pb].name, ".dtors") == 0) || shdr[i].sh_type == SHT_FINI_ARRAY) { if (lf->dtors_addr != 0) { printf( "%s: multiple dtor sections in %s\n", __func__, filename); } else { lf->dtors_addr = ef->progtab[pb].addr; lf->dtors_size = shdr[i].sh_size; } } /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } pb++; break; case SHT_REL: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr; ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; rl++; break; case SHT_RELA: if (shdr[shdr[i].sh_info].sh_addr == 0) break; ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr; ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; ra++; break; } } if (pb != ef->nprogtab) { printf("%s: lost progbits\n", filename); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { printf("%s: lost reltab\n", filename); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { printf("%s: lost relatab\n", filename); error = ENOEXEC; goto out; } /* * The file needs to be writeable and executable while applying * relocations. Mapping protections are applied once relocation * processing is complete. */ link_elf_protect_range(ef, (vm_offset_t)ef->address, round_page((vm_offset_t)ef->address + ef->lf.size), VM_PROT_ALL); /* Local intra-module relocations */ error = link_elf_reloc_local(lf, false); if (error != 0) goto out; *result = lf; return (0); out: /* preload not done this way */ linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } static void link_elf_invoke_cbs(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t)lf; error = relocate_file(ef); if (error) return (error); /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) return (error); #if defined(__i386__) || defined(__amd64__) /* Now ifuncs. */ error = link_elf_reloc_local(lf, true); if (error != 0) return (error); #endif /* Apply protections now that relocation processing is complete. 
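 * link_elf_protect() narrows each section to the rights implied by its
 * SHF_WRITE/SHF_EXECINSTR flags; the constructors are only invoked
 * once those final protections are in place.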
*/ link_elf_protect(ef); link_elf_invoke_cbs(lf->ctors_addr, lf->ctors_size); return (0); } static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata *nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; nd = malloc(sizeof(struct nameidata), M_TEMP, M_WAITOK); NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename); flags = FREAD; error = vn_open(nd, &flags, 0, NULL); if (error) { free(nd, M_TEMP); return error; } NDFREE_PNBUF(nd); if (nd->ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd->ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd->ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. */ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif case SHT_INIT_ARRAY: case SHT_FINI_ARRAY: if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: /* * Ignore relocation tables for unallocated * sections. 
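 * ("Unallocated" here means the target section lacks SHF_ALLOC and so
 * occupies no space in the loaded image.)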
*/ if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->nreltab++; break; case SHT_RELA: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file must have exactly one symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) { link_elf_error(filename, "lost symbol table index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif case SHT_INIT_ARRAY: case SHT_FINI_ARRAY: if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. 
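 * (the mapsize accumulated above already includes each SHF_ALLOC
 * section's sh_addralign padding).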
* This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(mapsize), VM_PROT_ALL, 0, thread0.td_ucred); if (ef->object == NULL) { error = ENOMEM; goto out; } #if VM_NRESERVLEVEL > 0 vm_object_color(ef->object, 0); #endif /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. * * Protections will be restricted once relocations are applied. */ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { vm_object_deallocate(ef->object); ef->object = NULL; error = ENOMEM; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif case SHT_INIT_ARRAY: case SHT_FINI_ARRAY: if ((shdr[i].sh_flags & SHF_ALLOC) == 0) break; alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab != NULL && shdr[i].sh_name != 0) { ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (!strcmp(ef->progtab[pb].name, ".ctors") || shdr[i].sh_type == SHT_INIT_ARRAY) { if (lf->ctors_addr != 0) { printf( "%s: multiple ctor sections in %s\n", __func__, filename); } else { lf->ctors_addr = (caddr_t)mapbase; lf->ctors_size = shdr[i].sh_size; } } else if (!strcmp(ef->progtab[pb].name, ".dtors") || shdr[i].sh_type == SHT_FINI_ARRAY) { if (lf->dtors_addr != 0) { printf( "%s: multiple dtor sections in %s\n", __func__, filename); } else { lf->dtors_addr = (caddr_t)mapbase; lf->dtors_size = shdr[i].sh_size; } } } else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<<UNWIND>>"; #endif else ef->progtab[pb].name = "<<NOBITS>>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { printf("%s: pcpu module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); } } #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { printf("%s: vnet module space is out " "of space; cannot allocate %#jx " "for %s\n", __func__, (uintmax_t)shdr[i].sh_size, filename); } } #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].flags = shdr[i].sh_flags; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS #ifdef __amd64__ || shdr[i].sh_type == SHT_X86_64_UNWIND #endif ) { error =
vn_rdwr(UIO_READ, nd->ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); +#ifdef VIMAGE + if (ef->progtab[pb].addr != (void *)mapbase && + strcmp(ef->progtab[pb].name, VNET_SETNAME) == 0) + vnet_save_init(ef->progtab[pb].addr, + ef->progtab[pb].size); +#endif /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: if ((shdr[shdr[i].sh_info].sh_flags & SHF_ALLOC) == 0) break; ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) { link_elf_error(filename, "lost progbits"); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { link_elf_error(filename, "lost reltab"); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { link_elf_error(filename, "lost relatab"); error = ENOEXEC; goto out; } if (mapbase != (vm_offset_t)ef->address + mapsize) { printf( "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", filename != NULL ? filename : "", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); error = ENOMEM; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf, false); if (error != 0) goto out; /* Pull in dependencies */ VOP_UNLOCK(nd->ni_vp); error = linker_load_dependencies(lf); vn_lock(nd->ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; #if defined(__i386__) || defined(__amd64__) /* Now ifuncs. */ error = link_elf_reloc_local(lf, true); if (error != 0) goto out; #endif link_elf_protect(ef); link_elf_invoke_cbs(lf->ctors_addr, lf->ctors_size); *result = lf; out: VOP_UNLOCK(nd->ni_vp); vn_close(nd->ni_vp, FREAD, td->td_ucred, td); free(nd, M_TEMP); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(hdr, M_LINKER); return error; } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; u_int i; link_elf_invoke_cbs(file->dtors_addr, file->dtors_size); /* Notify MD code that a module is being unloaded. 
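 * As in the shared-object linker, elf_cpu_unload_file() runs first;
 * the DPCPU_SETNAME and VNET_SETNAME sections are then handed back to
 * their own allocators below rather than being freed with the module
 * image.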
*/ elf_cpu_unload_file(file); if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME)) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif } } if (ef->preloaded) { free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); if (file->pathname != NULL) preload_delete_name(file->pathname); return; } for (i = 0; i < ef->nreltab; i++) free(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) free(ef->relatab[i].rela, M_LINKER); free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); if (ef->object != NULL) vm_map_remove(kernel_map, (vm_offset_t)ef->address, (vm_offset_t)ef->address + ptoa(ef->object->size)); free(ef->e_shdr, M_LINKER); free(ef->ddbsymtab, M_LINKER); free(ef->ddbstrtab, M_LINKER); free(ef->shstrtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->ddbsymtab + ELF_R_SYM(r_info); return ef->ddbstrtab + ref->st_name; } else return NULL; } static Elf_Addr findbase(elf_file_t ef, int sec) { int i; Elf_Addr base = 0; for (i = 0; i < ef->nprogtab; i++) { if (sec == ef->progtab[i].sec) { base = (Elf_Addr)ef->progtab[i].addr; break; } } return base; } static int relocate_file1(elf_file_t ef, bool ifuncs) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; const Elf_Sym *sym; int i; Elf_Size symidx; Elf_Addr base; /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab!"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rel->r_info)) != ifuncs) continue; if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL, elf_obj_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for relatab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rela->r_info)) != ifuncs) continue; if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA, 
elf_obj_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* * Only clean SHN_FBSD_CACHED for successful return. If we * modified symbol table for the object but found an * unresolved symbol, there is no reason to roll back. */ elf_obj_cleanup_globals_cache(ef); return (0); } static int relocate_file(elf_file_t ef) { int error; error = relocate_file1(ef, false); if (error == 0) error = relocate_file1(ef, true); return (error); } static int link_elf_lookup_symbol1(linker_file_t lf, const char *name, c_linker_sym_t *sym, bool see_local) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *symp; const char *strp; int i; for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) { if (see_local || ELF_ST_BIND(symp->st_info) == STB_GLOBAL) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { return (link_elf_lookup_symbol1(lf, name, sym, link_elf_obj_leak_locals)); } static int link_elf_lookup_debug_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { return (link_elf_lookup_symbol1(lf, name, sym, true)); } static int link_elf_symbol_values1(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval, bool see_local) { elf_file_t ef; const Elf_Sym *es; caddr_t val; ef = (elf_file_t) lf; es = (const Elf_Sym*) sym; val = (caddr_t)es->st_value; if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { if (!see_local && ELF_ST_BIND(es->st_info) == STB_LOCAL) return (ENOENT); symval->name = ef->ddbstrtab + es->st_name; val = (caddr_t)es->st_value; if (ELF_ST_TYPE(es->st_info) == STT_GNU_IFUNC) val = ((caddr_t (*)(void))val)(); symval->value = val; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { return (link_elf_symbol_values1(lf, sym, symval, link_elf_obj_leak_locals)); } static int link_elf_debug_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { return (link_elf_symbol_values1(lf, sym, symval, true)); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t)lf; u_long off = (uintptr_t)(void *)value; u_long diff = off; u_long st_value; const Elf_Sym *es; const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. 
*/ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { elf_file_t ef = (elf_file_t)lf; void **start, **stop; int i, count; /* Relative to section number */ for (i = 0; i < ef->nprogtab; i++) { if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) && strcmp(ef->progtab[i].name + 4, name) == 0) { start = (void **)ef->progtab[i].addr; stop = (void **)((char *)ef->progtab[i].addr + ef->progtab[i].size); count = stop - start; if (startp) *startp = start; if (stopp) *stopp = stop; if (countp) *countp = count; return (0); } } return (ESRCH); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && (ELF_ST_TYPE(symp->st_info) == STT_FUNC || ELF_ST_TYPE(symp->st_info) == STT_GNU_IFUNC)) { error = link_elf_debug_symbol_values(file, (c_linker_sym_t)symp, &symval); if (error == 0) error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } static void elf_obj_cleanup_globals_cache(elf_file_t ef) { Elf_Sym *sym; Elf_Size i; for (i = 0; i < ef->ddbsymcnt; i++) { sym = ef->ddbsymtab + i; if (sym->st_shndx == SHN_FBSD_CACHED) { sym->st_shndx = SHN_UNDEF; sym->st_value = 0; } } } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; Elf_Sym *sym; const char *symbol; Elf_Addr res1; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->ddbsymcnt) { *res = 0; return (EINVAL); } sym = ef->ddbsymtab + symidx; /* Quick answer if there is a definition included. */ if (sym->st_shndx != SHN_UNDEF) { res1 = (Elf_Addr)sym->st_value; if (ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC) res1 = ((Elf_Addr (*)(void))res1)(); *res = res1; return (0); } /* If we get here, then it is undefined and needs a lookup. */ switch (ELF_ST_BIND(sym->st_info)) { case STB_LOCAL: /* Local, but undefined? huh? */ *res = 0; return (EINVAL); case STB_GLOBAL: case STB_WEAK: /* Relative to Data or Function name */ symbol = ef->ddbstrtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps); /* * Cache global lookups during module relocation. The failure * case is particularly expensive for callers, who must scan * through the entire globals table doing strcmp(). Cache to * avoid doing such work repeatedly. 
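 *
 * Illustrative cost note (editorial addition, not in the original
 * comment): a module with M relocations against N distinct undefined
 * globals pays for M full strcmp() scans without the cache, but only N
 * scans plus M cheap indexed reads with it.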
* * After relocation is complete, undefined globals will be * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(), * above. */ if (res1 != 0) { sym->st_shndx = SHN_FBSD_CACHED; sym->st_value = res1; *res = res1; return (0); } else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { sym->st_value = 0; *res = 0; return (0); } return (EINVAL); default: return (EINVAL); } } static void link_elf_fix_link_set(elf_file_t ef) { static const char startn[] = "__start_"; static const char stopn[] = "__stop_"; Elf_Sym *sym; const char *sym_name, *linkset_name; Elf_Addr startp, stopp; Elf_Size symidx; int start, i; startp = stopp = 0; for (symidx = 1 /* zero entry is special */; symidx < ef->ddbsymcnt; symidx++) { sym = ef->ddbsymtab + symidx; if (sym->st_shndx != SHN_UNDEF) continue; sym_name = ef->ddbstrtab + sym->st_name; if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) { start = 1; linkset_name = sym_name + sizeof(startn) - 1; } else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) { start = 0; linkset_name = sym_name + sizeof(stopn) - 1; } else continue; for (i = 0; i < ef->nprogtab; i++) { if (strcmp(ef->progtab[i].name, linkset_name) == 0) { startp = (Elf_Addr)ef->progtab[i].addr; stopp = (Elf_Addr)(startp + ef->progtab[i].size); break; } } if (i == ef->nprogtab) continue; sym->st_value = start ? startp : stopp; sym->st_shndx = i; } } static int link_elf_reloc_local(linker_file_t lf, bool ifuncs) { elf_file_t ef = (elf_file_t)lf; const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const Elf_Sym *sym; Elf_Addr base; int i; Elf_Size symidx; link_elf_fix_link_set(ef); /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rel->r_info)) != ifuncs) continue; if (elf_reloc_local(lf, base, rel, ELF_RELOC_REL, elf_obj_lookup) != 0) return (ENOEXEC); } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; if ((ELF_ST_TYPE(sym->st_info) == STT_GNU_IFUNC || elf_is_ifunc_reloc(rela->r_info)) != ifuncs) continue; if (elf_reloc_local(lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup) != 0) return (ENOEXEC); } } return (0); } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = 
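/*
 * Illustrative sketch (assumed consumer, not part of this diff): once
 * link_elf_fix_link_set() has bound the __start_/__stop_ symbols to the
 * module-local section, the usual <sys/linker_set.h> iteration idiom
 * works unchanged:
 *
 *	SET_DECLARE(foo_set, struct foo);
 *	struct foo **fpp;
 *
 *	SET_FOREACH(fpp, foo_set)
 *		handle_foo(*fpp);
 */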
ef->ddbstrtab;
	if (*strtab == NULL)
		return (0);
	return (ef->ddbstrcnt);
}

#ifdef VIMAGE
static void
link_elf_propagate_vnets(linker_file_t lf)
{
	elf_file_t ef = (elf_file_t) lf;

	if (ef->progtab) {
		for (int i = 0; i < ef->nprogtab; i++) {
			if (ef->progtab[i].size == 0)
				continue;
			if (ef->progtab[i].name == NULL)
				continue;
			if (strcmp(ef->progtab[i].name, VNET_SETNAME) == 0) {
				vnet_data_copy(ef->progtab[i].addr,
				    ef->progtab[i].size);
				break;
			}
		}
	}
}
#endif
diff --git a/sys/net/vnet.c b/sys/net/vnet.c
index c4a623698341..ac937125a19d 100644
--- a/sys/net/vnet.c
+++ b/sys/net/vnet.c
@@ -1,814 +1,847 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2004-2009 University of Zagreb
 * Copyright (c) 2006-2009 FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by the University of Zagreb and the
 * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
 * FreeBSD Foundation.
 *
 * Copyright (c) 2009 Jeffrey Roberson
 * Copyright (c) 2009 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
#include "opt_ddb.h"
#include "opt_kdb.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef DDB
#include
#include
#endif

#include
#include
#include

/*-
 * This file implements core functions for virtual network stacks:
 *
 * - Virtual network stack management functions.
 *
 * - Virtual network stack memory allocator, which virtualizes global
 *   variables in the network stack
 *
 * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems
 *   to register startup/shutdown events to be run for each virtual network
 *   stack instance.
 */

FEATURE(vimage, "VIMAGE kernel virtualization");

static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");

/*
 * The virtual network stack list has two read-write locks, one sleepable and
 * the other not, so that the list can be stabilized and walked in a variety
 * of network stack contexts.  Both must be acquired exclusively to modify
 * the list, but a read lock of either lock is sufficient to walk the list.
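 *
 * For example (explanatory note): a walker that may sleep takes the sx
 * lock via VNET_LIST_RLOCK(), a non-sleepable context takes the rwlock
 * via VNET_LIST_RLOCK_NOSLEEP(), and writers take both exclusively via
 * VNET_LIST_WLOCK(), so either read lock alone keeps the list stable.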
 */
struct rwlock vnet_rwlock;
struct sx vnet_sxlock;

#define	VNET_LIST_WLOCK() do {						\
	sx_xlock(&vnet_sxlock);						\
	rw_wlock(&vnet_rwlock);						\
} while (0)

#define	VNET_LIST_WUNLOCK() do {					\
	rw_wunlock(&vnet_rwlock);					\
	sx_xunlock(&vnet_sxlock);					\
} while (0)

struct vnet_list_head vnet_head;
struct vnet *vnet0;

/*
 * The virtual network stack allocator provides storage for virtualized
 * global variables.  These variables are defined/declared using the
 * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet'
 * linker set.  The details of the implementation are somewhat subtle, but
 * allow the majority of network subsystems to remain virtualization-agnostic.
 *
 * The virtual network stack allocator handles variables in the base kernel
 * vs. modules in similar but different ways.  In both cases, virtualized
 * global variables are marked as such by being declared to be part of the
 * vnet linker set.  These "master" copies of global variables serve two
 * functions:
 *
 * (1) They contain static initialization or "default" values for global
 *     variables which will be propagated to each virtual network stack
 *     instance when created.  As with normal global variables, they default
 *     to zero-filled.
 *
 * (2) They act as unique global names by which the variable can be referred
 *     to, regardless of network stack instance.  The single global symbol
 *     will be used to calculate the location of a per-virtual instance
 *     variable at run-time.
 *
 * Each virtual network stack instance has a complete copy of each
 * virtualized global variable, stored in a malloc'd block of memory
 * referred to by vnet->vnet_data_mem.  Critical to the design is that each
 * per-instance memory block is laid out identically to the master block so
 * that the offset of each global variable is the same across all blocks.  To
 * optimize run-time access, a precalculated 'base' address,
 * vnet->vnet_data_base, is stored in each vnet, and is the amount that can
 * be added to the address of a 'master' instance of a variable to get to the
 * per-vnet instance.
 *
 * Virtualized global variables in modules are handled in a similar manner,
 * but as each module has its own 'set_vnet' linker set, and we want to keep
 * all virtualized globals together, we reserve space in the kernel's linker
 * set for potential module variables using a per-vnet character array,
 * 'modspace'.  The virtual network stack allocator maintains a free list to
 * track what space in the array is free (all, initially) and as modules are
 * linked, allocates portions of the space to specific globals.  The kernel
 * module linker queries the virtual network stack allocator and will
 * bind references of the global to the location during linking.  It also
 * calls into the virtual network stack allocator, once the memory is
 * initialized, in order to propagate the new static initializations to all
 * existing virtual network stack instances so that the soon-to-be executing
 * module will find every network stack instance with proper default values.
 */

/*
 * Number of bytes of data in the 'set_vnet' linker set, and hence the total
 * size of all kernel virtualized global variables, and the malloc(9) type
 * that will be used to allocate it.
 */
#define	VNET_BYTES	(VNET_STOP - VNET_START)

static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data");

/*
 * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of
 * global variables across all loaded modules.
As this actually sizes an * array declared as a virtualized global variable in the kernel itself, and * we want the virtualized global variable space to be page-sized, we may * have more space than that in practice. */ #define VNET_MODMIN (8 * PAGE_SIZE) #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) /* * Space to store virtualized global variables from loadable kernel modules, * and the free list to manage it. */ VNET_DEFINE_STATIC(char, modspace[VNET_MODMIN] __aligned(__alignof(void *))); +/* + * A copy of the initial values of all virtualized global variables. + */ +static uintptr_t vnet_init_var; + /* * Global lists of subsystem constructor and destructors for vnets. They are * registered via VNET_SYSINIT() and VNET_SYSUNINIT(). Both lists are * protected by the vnet_sysinit_sxlock global lock. */ static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors = TAILQ_HEAD_INITIALIZER(vnet_constructors); static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors = TAILQ_HEAD_INITIALIZER(vnet_destructors); struct sx vnet_sysinit_sxlock; #define VNET_SYSINIT_WLOCK() sx_xlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_WUNLOCK() sx_xunlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RLOCK() sx_slock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RUNLOCK() sx_sunlock(&vnet_sysinit_sxlock); struct vnet_data_free { uintptr_t vnd_start; int vnd_len; TAILQ_ENTRY(vnet_data_free) vnd_link; }; static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting"); static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head = TAILQ_HEAD_INITIALIZER(vnet_data_free_head); static struct sx vnet_data_free_lock; SDT_PROVIDER_DEFINE(vnet); SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, "int", "struct vnet *"); SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, "int"); /* * Run per-vnet sysinits or sysuninits during vnet creation/destruction. */ static void vnet_sysinit(void); static void vnet_sysuninit(void); #ifdef DDB static void db_show_vnet_print_vs(struct vnet_sysinit *, int); #endif /* * Allocate a virtual network stack. */ struct vnet * vnet_alloc(void) { struct vnet *vnet; SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__); vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet); /* * Allocate storage for virtualized global variables and copy in * initial values from our 'master' copy. */ vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK); memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES); /* * All use of vnet-specific data will immediately subtract VNET_START * from the base memory pointer, so pre-calculate that now to avoid * it on each use. */ vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START; /* Initialize / attach vnet module instances. */ CURVNET_SET_QUIET(vnet); vnet_sysinit(); CURVNET_RESTORE(); VNET_LIST_WLOCK(); LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); VNET_LIST_WUNLOCK(); SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet); return (vnet); } /* * Destroy a virtual network stack. 
 */
void
vnet_destroy(struct vnet *vnet)
{

	SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet);
	KASSERT(vnet->vnet_sockcnt == 0,
	    ("%s: vnet still has sockets", __func__));

	VNET_LIST_WLOCK();
	LIST_REMOVE(vnet, vnet_le);
	VNET_LIST_WUNLOCK();

	/* Signal that VNET is being shut down. */
	vnet->vnet_shutdown = true;

	CURVNET_SET_QUIET(vnet);
	sx_xlock(&ifnet_detach_sxlock);
	vnet_sysuninit();
	sx_xunlock(&ifnet_detach_sxlock);
	CURVNET_RESTORE();

	/*
	 * Release storage for the virtual network stack instance.
	 */
	free(vnet->vnet_data_mem, M_VNET_DATA);
	vnet->vnet_data_mem = NULL;
	vnet->vnet_data_base = 0;
	vnet->vnet_magic_n = 0xdeadbeef;
	free(vnet, M_VNET);
	SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__);
}

/*
 * Boot time initialization and allocation of virtual network stacks.
 */
static void
vnet_init_prelink(void *arg __unused)
{

	rw_init(&vnet_rwlock, "vnet_rwlock");
	sx_init(&vnet_sxlock, "vnet_sxlock");
	sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock");
	LIST_INIT(&vnet_head);
}
SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST,
    vnet_init_prelink, NULL);

static void
vnet0_init(void *arg __unused)
{

	if (bootverbose)
		printf("VIMAGE (virtualized network stack) enabled\n");

	/*
	 * We MUST clear curvnet in vnet_init_done() before going SMP,
	 * otherwise CURVNET_SET() macros would scream about unnecessary
	 * curvnet recursions.
	 */
	curvnet = prison0.pr_vnet = vnet0 = vnet_alloc();
}
SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL);

static void
vnet_init_done(void *unused __unused)
{

	curvnet = NULL;
}
SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_init_done,
    NULL);

/*
 * Once, at boot, initialize the modspace freelist to entirely cover modspace.
 */
static void
vnet_data_startup(void *dummy __unused)
{
	struct vnet_data_free *df;

	df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
	df->vnd_start = (uintptr_t)&VNET_NAME(modspace);
	df->vnd_len = VNET_MODMIN;
	TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link);
	sx_init(&vnet_data_free_lock, "vnet_data alloc lock");
+	vnet_init_var = (uintptr_t)malloc(VNET_BYTES, M_VNET_DATA, M_WAITOK);
}
SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, NULL);

/* Dummy VNET_SYSINIT to make sure we always reach the final end state. */
static void
vnet_sysinit_done(void *unused __unused)
{

	return;
}
VNET_SYSINIT(vnet_sysinit_done, SI_SUB_VNET_DONE, SI_ORDER_ANY,
    vnet_sysinit_done, NULL);

/*
 * When a module is loaded and requires storage for a virtualized global
 * variable, allocate space from the modspace free list.  This interface
 * should be used only by the kernel linker.
 */
void *
vnet_data_alloc(int size)
{
	struct vnet_data_free *df;
	void *s;

	s = NULL;
	size = roundup2(size, sizeof(void *));
	sx_xlock(&vnet_data_free_lock);
	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
		if (df->vnd_len < size)
			continue;
		if (df->vnd_len == size) {
			s = (void *)df->vnd_start;
			TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link);
			free(df, M_VNET_DATA_FREE);
			break;
		}
		s = (void *)df->vnd_start;
		df->vnd_len -= size;
		df->vnd_start = df->vnd_start + size;
		break;
	}
	sx_xunlock(&vnet_data_free_lock);
	return (s);
}

/*
 * Free space for a virtualized global variable on module unload.
 */
void
vnet_data_free(void *start_arg, int size)
{
	struct vnet_data_free *df;
	struct vnet_data_free *dn;
	uintptr_t start;
	uintptr_t end;

	size = roundup2(size, sizeof(void *));
	start = (uintptr_t)start_arg;
	end = start + size;
	/*
	 * Free a region of space and merge it with as many neighbors as
	 * possible.
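	 * For example (illustrative): freeing [s, s + len) when one free
	 * chunk already ends at s and the next one begins at s + len
	 * collapses all three regions into a single free-list entry.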
	 * Keeping the list sorted simplifies this operation.
	 */
	sx_xlock(&vnet_data_free_lock);
	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
		if (df->vnd_start > end)
			break;
		/*
		 * If we expand at the end of an entry we may have to merge
		 * it with the one following it as well.
		 */
		if (df->vnd_start + df->vnd_len == start) {
			df->vnd_len += size;
			dn = TAILQ_NEXT(df, vnd_link);
			if (dn != NULL &&
			    df->vnd_start + df->vnd_len == dn->vnd_start) {
				df->vnd_len += dn->vnd_len;
				TAILQ_REMOVE(&vnet_data_free_head, dn,
				    vnd_link);
				free(dn, M_VNET_DATA_FREE);
			}
			sx_xunlock(&vnet_data_free_lock);
			return;
		}
		if (df->vnd_start == end) {
			df->vnd_start = start;
			df->vnd_len += size;
			sx_xunlock(&vnet_data_free_lock);
			return;
		}
	}
	dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
	dn->vnd_start = start;
	dn->vnd_len = size;
	if (df)
		TAILQ_INSERT_BEFORE(df, dn, vnd_link);
	else
		TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link);
	sx_xunlock(&vnet_data_free_lock);
}

/*
 * When a new virtualized global variable has been allocated, propagate its
 * initial value to each already-allocated virtual network stack instance.
 */
void
vnet_data_copy(void *start, int size)
{
	struct vnet *vnet;

	VNET_LIST_RLOCK();
	LIST_FOREACH(vnet, &vnet_head, vnet_le)
		memcpy((void *)((uintptr_t)vnet->vnet_data_base +
		    (uintptr_t)start), start, size);
	VNET_LIST_RUNLOCK();
}

+/*
+ * Save a copy of the initial values of virtualized global variables.
+ */
+void
+vnet_save_init(void *start, size_t size)
+{
+	MPASS(vnet_init_var != 0);
+	MPASS(VNET_START <= (uintptr_t)start &&
+	    (uintptr_t)start + size <= VNET_STOP);
+	memcpy((void *)(vnet_init_var + ((uintptr_t)start - VNET_START)),
+	    start, size);
+}
+
+/*
+ * Restore the 'master' copies of virtualized global variables to their
+ * initial values.
+ */
+void
+vnet_restore_init(void *start, size_t size)
+{
+	MPASS(vnet_init_var != 0);
+	MPASS(VNET_START <= (uintptr_t)start &&
+	    (uintptr_t)start + size <= VNET_STOP);
+	memcpy(start,
+	    (void *)(vnet_init_var + ((uintptr_t)start - VNET_START)), size);
+}
+
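/*
 * Worked example (editorial illustration, not part of this diff): with
 * VNET_START == 0x1000, a set at start == 0x1400 of size == 0x80 makes
 * vnet_save_init() copy bytes [0x1400, 0x1480) to vnet_init_var + 0x400;
 * vnet_restore_init() later copies the same bytes back over the master
 * set, so a subsequent vnet_data_copy() propagates pristine defaults.
 */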
/*
 * Support for special SYSINIT handlers registered via VNET_SYSINIT()
 * and VNET_SYSUNINIT().
 */
void
vnet_register_sysinit(void *arg)
{
	struct vnet_sysinit *vs, *vs2;
	struct vnet *vnet;

	vs = arg;
	KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early"));

	/* Add the constructor to the global list of vnet constructors. */
	VNET_SYSINIT_WLOCK();
	TAILQ_FOREACH(vs2, &vnet_constructors, link) {
		if (vs2->subsystem > vs->subsystem)
			break;
		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
			break;
	}
	if (vs2 != NULL)
		TAILQ_INSERT_BEFORE(vs2, vs, link);
	else
		TAILQ_INSERT_TAIL(&vnet_constructors, vs, link);

	/*
	 * Invoke the constructor on all the existing vnets when it is
	 * registered.
	 */
	VNET_FOREACH(vnet) {
		CURVNET_SET_QUIET(vnet);
		vs->func(vs->arg);
		CURVNET_RESTORE();
	}
	VNET_SYSINIT_WUNLOCK();
}

void
vnet_deregister_sysinit(void *arg)
{
	struct vnet_sysinit *vs;

	vs = arg;

	/* Remove the constructor from the global list of vnet constructors. */
	VNET_SYSINIT_WLOCK();
	TAILQ_REMOVE(&vnet_constructors, vs, link);
	VNET_SYSINIT_WUNLOCK();
}

void
vnet_register_sysuninit(void *arg)
{
	struct vnet_sysinit *vs, *vs2;

	vs = arg;

	/* Add the destructor to the global list of vnet destructors. */
	VNET_SYSINIT_WLOCK();
	TAILQ_FOREACH(vs2, &vnet_destructors, link) {
		if (vs2->subsystem > vs->subsystem)
			break;
		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
			break;
	}
	if (vs2 != NULL)
		TAILQ_INSERT_BEFORE(vs2, vs, link);
	else
		TAILQ_INSERT_TAIL(&vnet_destructors, vs, link);
	VNET_SYSINIT_WUNLOCK();
}

void
vnet_deregister_sysuninit(void *arg)
{
	struct vnet_sysinit *vs;
	struct vnet *vnet;

	vs = arg;

	/*
	 * Invoke the destructor on all the existing vnets when it is
	 * deregistered.
	 */
	VNET_SYSINIT_WLOCK();
	VNET_FOREACH(vnet) {
		CURVNET_SET_QUIET(vnet);
		vs->func(vs->arg);
		CURVNET_RESTORE();
	}

	/* Remove the destructor from the global list of vnet destructors. */
	TAILQ_REMOVE(&vnet_destructors, vs, link);
	VNET_SYSINIT_WUNLOCK();
}

/*
 * Invoke all registered vnet constructors on the current vnet.  Used during
 * vnet construction.  The caller is responsible for ensuring the new vnet is
 * the current vnet and that the vnet_sysinit_sxlock lock is locked.
 */
static void
vnet_sysinit(void)
{
	struct vnet_sysinit *vs;

	VNET_SYSINIT_RLOCK();
	TAILQ_FOREACH(vs, &vnet_constructors, link) {
		curvnet->vnet_state = vs->subsystem;
		vs->func(vs->arg);
	}
	VNET_SYSINIT_RUNLOCK();
}

/*
 * Invoke all registered vnet destructors on the current vnet.  Used during
 * vnet destruction.  The caller is responsible for ensuring the dying vnet
 * is the current vnet and that the vnet_sysinit_sxlock lock is locked.
 */
static void
vnet_sysuninit(void)
{
	struct vnet_sysinit *vs;

	VNET_SYSINIT_RLOCK();
	TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head,
	    link) {
		curvnet->vnet_state = vs->subsystem;
		vs->func(vs->arg);
	}
	VNET_SYSINIT_RUNLOCK();
}

/*
 * EVENTHANDLER(9) extensions.
 */
/*
 * Invoke the eventhandler function originally registered, with the argument
 * (if any) that was registered along with it, for all virtual network stack
 * instances.
 *
 * This iterator can only be used for eventhandlers that do not take any
 * additional arguments, as we ignore the variadic arguments from the
 * EVENTHANDLER_INVOKE() call.
 */
void
vnet_global_eventhandler_iterator_func(void *arg, ...)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct eventhandler_entry_vimage *v_ee;

	/*
	 * There is a bug here in that we should actually cast things to
	 * (struct eventhandler_entry_ ## name *) but that's not easily
	 * possible in here so just re-using the variadic version we
	 * defined for the generic vimage case.
	 */
	v_ee = arg;
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

#ifdef VNET_DEBUG
struct vnet_recursion {
	SLIST_ENTRY(vnet_recursion) vnr_le;
	const char *prev_fn;
	const char *where_fn;
	int where_line;
	struct vnet *old_vnet;
	struct vnet *new_vnet;
};

static SLIST_HEAD(, vnet_recursion) vnet_recursions =
    SLIST_HEAD_INITIALIZER(vnet_recursions);

static void
vnet_print_recursion(struct vnet_recursion *vnr, int brief)
{

	if (!brief)
		printf("CURVNET_SET() recursion in ");
	printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line,
	    vnr->prev_fn);
	if (brief)
		printf(", ");
	else
		printf("\n ");
	printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet);
}

void
vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line)
{
	struct vnet_recursion *vnr;

	/* Skip already logged recursion events.
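	 * A recursion means CURVNET_SET() ran while curvnet was already
	 * set, e.g. (illustrative) CURVNET_SET(vnet2) reached under an
	 * outer CURVNET_SET(vnet1) with no intervening CURVNET_RESTORE().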
*/ SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) if (vnr->prev_fn == old_fn && vnr->where_fn == curthread->td_vnet_lpush && vnr->where_line == line && (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet)) return; vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO); if (vnr == NULL) panic("%s: malloc failed", __func__); vnr->prev_fn = old_fn; vnr->where_fn = curthread->td_vnet_lpush; vnr->where_line = line; vnr->old_vnet = old_vnet; vnr->new_vnet = curvnet; SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le); vnet_print_recursion(vnr, 0); #ifdef KDB kdb_backtrace(); #endif } #endif /* VNET_DEBUG */ /* * DDB(4). */ #ifdef DDB static void db_vnet_print(struct vnet *vnet) { db_printf("vnet = %p\n", vnet); db_printf(" vnet_magic_n = %#08x (%s, orig %#08x)\n", vnet->vnet_magic_n, (vnet->vnet_magic_n == VNET_MAGIC_N) ? "ok" : "mismatch", VNET_MAGIC_N); db_printf(" vnet_ifcnt = %u\n", vnet->vnet_ifcnt); db_printf(" vnet_sockcnt = %u\n", vnet->vnet_sockcnt); db_printf(" vnet_data_mem = %p\n", vnet->vnet_data_mem); db_printf(" vnet_data_base = %#jx\n", (uintmax_t)vnet->vnet_data_base); db_printf(" vnet_state = %#08x\n", vnet->vnet_state); db_printf(" vnet_shutdown = %#03x\n", vnet->vnet_shutdown); db_printf("\n"); } DB_SHOW_ALL_COMMAND(vnets, db_show_all_vnets) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { db_vnet_print(vnet_iter); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet, db_show_vnet) { if (!have_addr) { db_printf("usage: show vnet \n"); return; } db_vnet_print((struct vnet *)addr); } static void db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb) { const char *vsname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (vs == NULL) { xprint("%s: no vnet_sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); db_symbol_values(sym, &vsname, NULL); sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); xprint(" %#08x %#08x\n", vs->subsystem, vs->order); xprint(" %p(%s)(%p)\n", vs->func, (funcname != NULL) ? funcname : "", vs->arg); #undef xprint } DB_SHOW_COMMAND_FLAGS(vnet_sysinit, db_show_vnet_sysinit, DB_CMD_MEMSAFE) { struct vnet_sysinit *vs; db_printf("VNET_SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH(vs, &vnet_constructors, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } DB_SHOW_COMMAND_FLAGS(vnet_sysuninit, db_show_vnet_sysuninit, DB_CMD_MEMSAFE) { struct vnet_sysinit *vs; db_printf("VNET_SYSUNINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } #ifdef VNET_DEBUG DB_SHOW_COMMAND_FLAGS(vnetrcrs, db_show_vnetrcrs, DB_CMD_MEMSAFE) { struct vnet_recursion *vnr; SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) vnet_print_recursion(vnr, 1); } #endif #endif /* DDB */ diff --git a/sys/net/vnet.h b/sys/net/vnet.h index 1d37fe85eec3..5485889ceaa7 100644 --- a/sys/net/vnet.h +++ b/sys/net/vnet.h @@ -1,455 +1,461 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. 
* * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * This header file defines several sets of interfaces supporting virtualized * network stacks: * * - Definition of 'struct vnet' and functions and macros to allocate/free/ * manipulate it. * * - A virtual network stack memory allocator, which provides support for * virtualized global variables via a special linker set, set_vnet. * * - Virtualized sysinits/sysuninits, which allow constructors and * destructors to be run for each network stack subsystem as virtual * instances are created and destroyed. * * If VIMAGE isn't compiled into the kernel, virtualized global variables * compile to normal global variables, and virtualized sysinits to regular * sysinits. */ #ifndef _NET_VNET_H_ #define _NET_VNET_H_ /* * struct vnet describes a virtualized network stack, and is primarily a * pointer to storage for virtualized global variables. Expose to userspace * as required for libkvm. */ #if defined(_KERNEL) || defined(_WANT_VNET) #include /* for CACHE_LINE_SIZE */ #include struct vnet { LIST_ENTRY(vnet) vnet_le; /* all vnets list */ u_int vnet_magic_n; u_int vnet_ifcnt; u_int vnet_sockcnt; u_int vnet_state; /* SI_SUB_* */ void *vnet_data_mem; uintptr_t vnet_data_base; bool vnet_shutdown; /* Shutdown in progress. */ } __aligned(CACHE_LINE_SIZE); #define VNET_MAGIC_N 0x5e4a6f28 /* * These two virtual network stack allocator definitions are also required * for libkvm so that it can evaluate virtualized global variables. 
*/ #define VNET_SETNAME "set_vnet" #define VNET_SYMPREFIX "vnet_entry_" #endif #ifdef _KERNEL #define VNET_PCPUSTAT_DECLARE(type, name) \ VNET_DECLARE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_DEFINE(type, name) \ VNET_DEFINE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_DEFINE_STATIC(type, name) \ VNET_DEFINE_STATIC(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_ALLOC(name, wait) \ COUNTER_ARRAY_ALLOC(VNET(name), \ sizeof(VNET(name)) / sizeof(counter_u64_t), (wait)) #define VNET_PCPUSTAT_FREE(name) \ COUNTER_ARRAY_FREE(VNET(name), sizeof(VNET(name)) / sizeof(counter_u64_t)) #define VNET_PCPUSTAT_ADD(type, name, f, v) \ counter_u64_add(VNET(name)[offsetof(type, f) / sizeof(uint64_t)], (v)) #define VNET_PCPUSTAT_FETCH(type, name, f) \ counter_u64_fetch(VNET(name)[offsetof(type, f) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_SYSINIT(name) \ static void \ vnet_##name##_init(const void *unused) \ { \ VNET_PCPUSTAT_ALLOC(name, M_WAITOK); \ } \ VNET_SYSINIT(vnet_ ## name ## _init, SI_SUB_INIT_IF, \ SI_ORDER_FIRST, vnet_ ## name ## _init, NULL) #define VNET_PCPUSTAT_SYSUNINIT(name) \ static void \ vnet_##name##_uninit(const void *unused) \ { \ VNET_PCPUSTAT_FREE(name); \ } \ VNET_SYSUNINIT(vnet_ ## name ## _uninit, SI_SUB_INIT_IF, \ SI_ORDER_FIRST, vnet_ ## name ## _uninit, NULL) #ifdef SYSCTL_OID #define SYSCTL_VNET_PCPUSTAT(parent, nbr, name, type, array, desc) \ static int \ array##_sysctl(SYSCTL_HANDLER_ARGS) \ { \ type s; \ CTASSERT((sizeof(type) / sizeof(uint64_t)) == \ (sizeof(VNET(array)) / sizeof(counter_u64_t))); \ COUNTER_ARRAY_COPY(VNET(array), &s, sizeof(type) / sizeof(uint64_t));\ if (req->newptr) \ COUNTER_ARRAY_ZERO(VNET(array), \ sizeof(type) / sizeof(uint64_t)); \ return (SYSCTL_OUT(req, &s, sizeof(type))); \ } \ SYSCTL_PROC(parent, nbr, name, \ CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, \ NULL, 0, array ## _sysctl, "I", desc) #endif /* SYSCTL_OID */ #ifdef VIMAGE #include #include /* for struct thread */ #include #include /* * Location of the kernel's 'set_vnet' linker set. */ extern uintptr_t *__start_set_vnet; __GLOBL(__start_set_vnet); extern uintptr_t *__stop_set_vnet; __GLOBL(__stop_set_vnet); #define VNET_START (uintptr_t)&__start_set_vnet #define VNET_STOP (uintptr_t)&__stop_set_vnet /* * Functions to allocate and destroy virtual network stacks. */ struct vnet *vnet_alloc(void); void vnet_destroy(struct vnet *vnet); /* * The current virtual network stack -- we may wish to move this to struct * pcpu in the future. */ #define curvnet curthread->td_vnet /* * Various macros -- get and set the current network stack, but also * assertions. 
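 *
 * Typical usage (illustrative):
 *
 *	CURVNET_SET(vnet);
 *	... access VNET() variables of 'vnet' ...
 *	CURVNET_RESTORE();
 */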
*/ #if defined(INVARIANTS) || defined(VNET_DEBUG) #define VNET_ASSERT(exp, msg) do { \ if (!(exp)) \ panic msg; \ } while (0) #else #define VNET_ASSERT(exp, msg) do { \ } while (0) #endif #ifdef VNET_DEBUG void vnet_log_recursion(struct vnet *, const char *, int); #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ curvnet = arg; \ curthread->td_vnet_lpush = __func__; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) \ if (saved_vnet) \ vnet_log_recursion(saved_vnet, saved_vnet_lpush, __LINE__); #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; \ curthread->td_vnet_lpush = saved_vnet_lpush; #else /* !VNET_DEBUG */ #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ curvnet = arg; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; #endif /* VNET_DEBUG */ #define CURVNET_ASSERT_SET() \ VNET_ASSERT(curvnet != NULL, ("vnet is not set at %s:%d %s()", \ __FILE__, __LINE__, __func__)) extern struct vnet *vnet0; #define IS_DEFAULT_VNET(arg) ((arg) == vnet0) #define CRED_TO_VNET(cr) (cr)->cr_prison->pr_vnet #define TD_TO_VNET(td) CRED_TO_VNET((td)->td_ucred) #define P_TO_VNET(p) CRED_TO_VNET((p)->p_ucred) /* * Global linked list of all virtual network stacks, along with read locks to * access it. If a caller may sleep while accessing the list, it must use * the sleepable lock macros. */ LIST_HEAD(vnet_list_head, vnet); extern struct vnet_list_head vnet_head; extern struct rwlock vnet_rwlock; extern struct sx vnet_sxlock; #define VNET_LIST_RLOCK() sx_slock(&vnet_sxlock) #define VNET_LIST_RLOCK_NOSLEEP() rw_rlock(&vnet_rwlock) #define VNET_LIST_RUNLOCK() sx_sunlock(&vnet_sxlock) #define VNET_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vnet_rwlock) /* * Iteration macros to walk the global list of virtual network stacks. */ #define VNET_ITERATOR_DECL(arg) struct vnet *arg #define VNET_FOREACH(arg) LIST_FOREACH((arg), &vnet_head, vnet_le) /* * Virtual network stack memory allocator, which allows global variables to * be automatically instantiated for each network stack instance. */ #define VNET_NAME(n) vnet_entry_##n #define VNET_DECLARE(t, n) extern t VNET_NAME(n) /* struct _hack is to stop this from being used with static data */ #define VNET_DEFINE(t, n) \ struct _hack; t VNET_NAME(n) __section(VNET_SETNAME) __used #if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv) \ || defined(__powerpc64__) || defined(__i386__)) /* * As with DPCPU_DEFINE_STATIC we are unable to mark this data as static * in modules on some architectures. 
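 *
 * Usage sketch (illustrative, assumed names): a virtualized global is
 * typically defined once and then accessed through a V_-prefixed
 * shorthand:
 *
 *	VNET_DEFINE(int, foo_enabled);
 *	#define	V_foo_enabled	VNET(foo_enabled)
 */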
*/ #define VNET_DEFINE_STATIC(t, n) \ t VNET_NAME(n) __section(VNET_SETNAME) __used #else #define VNET_DEFINE_STATIC(t, n) \ static t VNET_NAME(n) __section(VNET_SETNAME) __used #endif #define _VNET_PTR(b, n) (__typeof(VNET_NAME(n))*) \ ((b) + (uintptr_t)&VNET_NAME(n)) #define _VNET(b, n) (*_VNET_PTR(b, n)) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) _VNET_PTR((vnet)->vnet_data_base, n) #define VNET_VNET(vnet, n) (*VNET_VNET_PTR((vnet), n)) #define VNET_PTR(n) VNET_VNET_PTR(curvnet, n) #define VNET(n) VNET_VNET(curvnet, n) /* * Virtual network stack allocator interfaces from the kernel linker. */ void *vnet_data_alloc(int size); void vnet_data_copy(void *start, int size); void vnet_data_free(void *start_arg, int size); +/* + * Interfaces to manipulate the initial values of virtualized global variables. + */ +void vnet_save_init(void *, size_t); +void vnet_restore_init(void *, size_t); + /* * Virtual sysinit mechanism, allowing network stack components to declare * startup and shutdown methods to be run when virtual network stack * instances are created and destroyed. */ #include /* * SYSINIT/SYSUNINIT variants that provide per-vnet constructors and * destructors. */ struct vnet_sysinit { enum sysinit_sub_id subsystem; enum sysinit_elem_order order; sysinit_cfunc_t func; const void *arg; TAILQ_ENTRY(vnet_sysinit) link; }; #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ CTASSERT((subsystem) > SI_SUB_VNET && \ (subsystem) <= SI_SUB_VNET_DONE); \ static struct vnet_sysinit ident ## _vnet_init = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_init_ ## ident, subsystem, order, \ vnet_register_sysinit, &ident ## _vnet_init); \ SYSUNINIT(vnet_init_ ## ident, subsystem, order, \ vnet_deregister_sysinit, &ident ## _vnet_init) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ CTASSERT((subsystem) > SI_SUB_VNET && \ (subsystem) <= SI_SUB_VNET_DONE); \ static struct vnet_sysinit ident ## _vnet_uninit = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_register_sysuninit, &ident ## _vnet_uninit); \ SYSUNINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_deregister_sysuninit, &ident ## _vnet_uninit) /* * Interfaces for managing per-vnet constructors and destructors. */ void vnet_register_sysinit(void *arg); void vnet_register_sysuninit(void *arg); void vnet_deregister_sysinit(void *arg); void vnet_deregister_sysuninit(void *arg); /* * EVENTHANDLER(9) extensions. */ #include void vnet_global_eventhandler_iterator_func(void *, ...); #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ (tag) = vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #else /* !VIMAGE */ /* * Various virtual network stack macros compile to no-ops without VIMAGE. 
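 *
 * For example (illustrative): a constructor registered with
 *
 *	VNET_SYSINIT(foo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
 *	    foo_init, NULL);
 *
 * runs once per vnet under VIMAGE, and collapses to a plain SYSINIT()
 * below.
 */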
#define	curvnet			NULL

#define	VNET_ASSERT(exp, msg)
#define	CURVNET_SET(arg)
#define	CURVNET_SET_QUIET(arg)
#define	CURVNET_RESTORE()
#define	CURVNET_ASSERT_SET()

#define	VNET_LIST_RLOCK()
#define	VNET_LIST_RLOCK_NOSLEEP()
#define	VNET_LIST_RUNLOCK()
#define	VNET_LIST_RUNLOCK_NOSLEEP()

#define	VNET_ITERATOR_DECL(arg)
#define	VNET_FOREACH(arg)	for (int _vn = 0; _vn == 0; _vn++)

#define	IS_DEFAULT_VNET(arg)	1

#define	CRED_TO_VNET(cr)	NULL
#define	TD_TO_VNET(td)		NULL
#define	P_TO_VNET(p)		NULL

/*
 * Versions of the VNET macros that compile to normal global variables and
 * standard sysctl definitions.
 */
#define	VNET_NAME(n)		n
#define	VNET_DECLARE(t, n)	extern t n
#define	VNET_DEFINE(t, n)	struct _hack; t n
#define	VNET_DEFINE_STATIC(t, n)	static t n
#define	_VNET_PTR(b, n)		&VNET_NAME(n)

/*
 * Virtualized global variable accessor macros.
 */
#define	VNET_VNET_PTR(vnet, n)	(&(n))
#define	VNET_VNET(vnet, n)	(n)

#define	VNET_PTR(n)		(&(n))
#define	VNET(n)			(n)

/*
 * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT
 * map into normal sysinits, which have the same ordering properties.
 */
#define	VNET_SYSINIT(ident, subsystem, order, func, arg)		\
	SYSINIT(ident, subsystem, order, func, arg)
#define	VNET_SYSUNINIT(ident, subsystem, order, func, arg)		\
	SYSUNINIT(ident, subsystem, order, func, arg)

/*
 * Without VIMAGE revert to the default implementation.
 */
#define	VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
	(tag) = eventhandler_register(NULL, #name, func, arg, priority)
#define	VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority)	\
	eventhandler_register(NULL, #name, func, arg, priority)
#endif /* VIMAGE */
#endif /* _KERNEL */

#endif /* !_NET_VNET_H_ */
diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index 47024ecf87a9..c0d9811dd1b9 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -1,329 +1,334 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1999 Michael Smith
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SYS_EVENTHANDLER_H_
#define _SYS_EVENTHANDLER_H_

#include
#include
#include
#include
#include

#ifdef VIMAGE
struct eventhandler_entry_vimage {
	void	(* func)(void);		/* Original function registered. */
	void	*ee_arg;		/* Original argument registered.
*/ void *sparep[2]; }; #endif struct eventhandler_list { char *el_name; int el_flags; /* Unused. */ u_int el_runcount; struct mtx el_lock; TAILQ_ENTRY(eventhandler_list) el_link; TAILQ_HEAD(,eventhandler_entry) el_entries; }; #define EHL_LOCK(p) mtx_lock(&(p)->el_lock) #define EHL_UNLOCK(p) mtx_unlock(&(p)->el_lock) #define EHL_LOCK_ASSERT(p, x) mtx_assert(&(p)->el_lock, x) /* * Macro to invoke the handlers for a given event. */ #define _EVENTHANDLER_INVOKE(name, list, ...) do { \ struct eventhandler_entry *_ep; \ struct eventhandler_entry_ ## name *_t; \ \ EHL_LOCK_ASSERT((list), MA_OWNED); \ (list)->el_runcount++; \ KASSERT((list)->el_runcount > 0, \ ("eventhandler_invoke: runcount overflow")); \ CTR0(KTR_EVH, "eventhandler_invoke(\"" __STRING(name) "\")"); \ TAILQ_FOREACH(_ep, &((list)->el_entries), ee_link) { \ if (_ep->ee_priority != EHE_DEAD_PRIORITY) { \ EHL_UNLOCK((list)); \ _t = (struct eventhandler_entry_ ## name *)_ep; \ CTR1(KTR_EVH, "eventhandler_invoke: executing %p", \ (void *)_t->eh_func); \ _t->eh_func(_ep->ee_arg , ## __VA_ARGS__); \ EHL_LOCK((list)); \ } \ } \ KASSERT((list)->el_runcount > 0, \ ("eventhandler_invoke: runcount underflow")); \ (list)->el_runcount--; \ if ((list)->el_runcount == 0) \ eventhandler_prune_list(list); \ EHL_UNLOCK((list)); \ } while (0) /* * You can optionally use the EVENTHANDLER_LIST and EVENTHANDLER_DIRECT macros * to pre-define a symbol for the eventhandler list. This symbol can be used by * EVENTHANDLER_DIRECT_INVOKE, which has the advantage of not needing to do a * locked search of the global list of eventhandler lists. At least * EVENTHANDLER_LIST_DEFINE must be used for EVENTHANDLER_DIRECT_INVOKE to * work. EVENTHANDLER_LIST_DECLARE is only needed if the call to * EVENTHANDLER_DIRECT_INVOKE is in a different compilation unit from * EVENTHANDLER_LIST_DEFINE. If the events are even relatively high frequency * it is suggested that you directly define a list for them. */ #define EVENTHANDLER_LIST_DEFINE(name) \ struct eventhandler_list *_eventhandler_list_ ## name ; \ static void _ehl_init_ ## name (void * ctx __unused) \ { \ _eventhandler_list_ ## name = eventhandler_create_list(#name); \ } \ SYSINIT(name ## _ehl_init, SI_SUB_EVENTHANDLER, SI_ORDER_ANY, \ _ehl_init_ ## name, NULL); \ struct __hack #define EVENTHANDLER_DIRECT_INVOKE(name, ...) do { \ struct eventhandler_list *_el; \ \ _el = _eventhandler_list_ ## name ; \ if (!TAILQ_EMPTY(&_el->el_entries)) { \ EHL_LOCK(_el); \ _EVENTHANDLER_INVOKE(name, _el , ## __VA_ARGS__); \ } \ } while (0) #define EVENTHANDLER_DEFINE(name, func, arg, priority) \ static eventhandler_tag name ## _tag; \ static void name ## _evh_init(void *ctx) \ { \ name ## _tag = EVENTHANDLER_REGISTER(name, func, ctx, \ priority); \ } \ SYSINIT(name ## _evh_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, \ name ## _evh_init, arg); \ struct __hack #define EVENTHANDLER_INVOKE(name, ...) 
\ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ _EVENTHANDLER_INVOKE(name, _el , ## __VA_ARGS__); \ } while (0) #define EVENTHANDLER_REGISTER(name, func, arg, priority) \ eventhandler_register(NULL, #name, func, arg, priority) #define EVENTHANDLER_DEREGISTER(name, tag) \ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ eventhandler_deregister(_el, tag); \ } while (0) #define EVENTHANDLER_DEREGISTER_NOWAIT(name, tag) \ do { \ struct eventhandler_list *_el; \ \ if ((_el = eventhandler_find_list(#name)) != NULL) \ eventhandler_deregister_nowait(_el, tag); \ } while (0) eventhandler_tag eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority); void eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag); void eventhandler_deregister_nowait(struct eventhandler_list *list, eventhandler_tag tag); struct eventhandler_list *eventhandler_find_list(const char *name); void eventhandler_prune_list(struct eventhandler_list *list); struct eventhandler_list *eventhandler_create_list(const char *name); #ifdef VIMAGE typedef void (*vimage_iterator_func_t)(void *, ...); eventhandler_tag vimage_eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority, vimage_iterator_func_t); #endif /* * Standard system event queues. */ /* Generic priority levels */ #define EVENTHANDLER_PRI_FIRST 0 #define EVENTHANDLER_PRI_ANY 10000 #define EVENTHANDLER_PRI_LAST 20000 /* * Successive shutdown events invoked by kern_reboot(9). * * Handlers will receive the 'howto' value as their second argument. * * All handlers must be prepared to be executed from a panic/debugger context; * see the man page for details. */ typedef void (*shutdown_fn)(void *, int); #define SHUTDOWN_PRI_FIRST EVENTHANDLER_PRI_FIRST #define SHUTDOWN_PRI_DEFAULT EVENTHANDLER_PRI_ANY #define SHUTDOWN_PRI_LAST EVENTHANDLER_PRI_LAST EVENTHANDLER_DECLARE(shutdown_pre_sync, shutdown_fn); /* before fs sync */ EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn); /* after fs sync */ EVENTHANDLER_DECLARE(shutdown_final, shutdown_fn); /* Power state change events */ typedef void (*power_change_fn)(void *); EVENTHANDLER_DECLARE(power_resume, power_change_fn); EVENTHANDLER_DECLARE(power_suspend, power_change_fn); EVENTHANDLER_DECLARE(power_suspend_early, power_change_fn); /* Low memory event */ typedef void (*vm_lowmem_handler_t)(void *, int); #define LOWMEM_PRI_DEFAULT EVENTHANDLER_PRI_FIRST EVENTHANDLER_DECLARE(vm_lowmem, vm_lowmem_handler_t); /* Some of mbuf(9) zones reached maximum */ EVENTHANDLER_DECLARE(mbuf_lowmem, vm_lowmem_handler_t); /* Root mounted event */ typedef void (*mountroot_handler_t)(void *); EVENTHANDLER_DECLARE(mountroot, mountroot_handler_t); /* File system mount events */ struct mount; struct vnode; struct thread; typedef void (*vfs_mounted_notify_fn)(void *, struct mount *, struct vnode *, struct thread *); typedef void (*vfs_unmounted_notify_fn)(void *, struct mount *, struct thread *); EVENTHANDLER_DECLARE(vfs_mounted, vfs_mounted_notify_fn); EVENTHANDLER_DECLARE(vfs_unmounted, vfs_unmounted_notify_fn); /* * Process events * process_fork and exit handlers are called without Giant. * exec handlers are called with Giant, but that is by accident. 
*/ struct proc; struct image_params; typedef void (*exitlist_fn)(void *, struct proc *); typedef void (*forklist_fn)(void *, struct proc *, struct proc *, int); typedef void (*execlist_fn)(void *, struct proc *, struct image_params *); typedef void (*proc_ctor_fn)(void *, struct proc *); typedef void (*proc_dtor_fn)(void *, struct proc *); typedef void (*proc_init_fn)(void *, struct proc *); typedef void (*proc_fini_fn)(void *, struct proc *); EVENTHANDLER_DECLARE(process_ctor, proc_ctor_fn); EVENTHANDLER_DECLARE(process_dtor, proc_dtor_fn); EVENTHANDLER_DECLARE(process_init, proc_init_fn); EVENTHANDLER_DECLARE(process_fini, proc_fini_fn); EVENTHANDLER_DECLARE(process_exit, exitlist_fn); EVENTHANDLER_DECLARE(process_fork, forklist_fn); EVENTHANDLER_DECLARE(process_exec, execlist_fn); /* * application dump event */ typedef void (*app_coredump_start_fn)(void *, struct thread *, char *name); typedef void (*app_coredump_progress_fn)(void *, struct thread *td, int byte_count); typedef void (*app_coredump_finish_fn)(void *, struct thread *td); typedef void (*app_coredump_error_fn)(void *, struct thread *td, char *msg, ...); EVENTHANDLER_DECLARE(app_coredump_start, app_coredump_start_fn); EVENTHANDLER_DECLARE(app_coredump_progress, app_coredump_progress_fn); EVENTHANDLER_DECLARE(app_coredump_finish, app_coredump_finish_fn); EVENTHANDLER_DECLARE(app_coredump_error, app_coredump_error_fn); typedef void (*thread_ctor_fn)(void *, struct thread *); typedef void (*thread_dtor_fn)(void *, struct thread *); typedef void (*thread_fini_fn)(void *, struct thread *); typedef void (*thread_init_fn)(void *, struct thread *); EVENTHANDLER_DECLARE(thread_ctor, thread_ctor_fn); EVENTHANDLER_DECLARE(thread_dtor, thread_dtor_fn); EVENTHANDLER_DECLARE(thread_init, thread_init_fn); EVENTHANDLER_DECLARE(thread_fini, thread_fini_fn); typedef void (*uma_zone_chfn)(void *); EVENTHANDLER_DECLARE(nmbclusters_change, uma_zone_chfn); EVENTHANDLER_DECLARE(nmbufs_change, uma_zone_chfn); EVENTHANDLER_DECLARE(maxsockets_change, uma_zone_chfn); /* Kernel linker file load and unload events */ struct linker_file; typedef void (*kld_load_fn)(void *, struct linker_file *); typedef void (*kld_unload_fn)(void *, const char *, caddr_t, size_t); typedef void (*kld_unload_try_fn)(void *, struct linker_file *, int *); EVENTHANDLER_DECLARE(kld_load, kld_load_fn); EVENTHANDLER_DECLARE(kld_unload, kld_unload_fn); EVENTHANDLER_DECLARE(kld_unload_try, kld_unload_try_fn); /* Generic graphics framebuffer interface */ struct fb_info; typedef void (*register_framebuffer_fn)(void *, struct fb_info *); typedef void (*unregister_framebuffer_fn)(void *, struct fb_info *); EVENTHANDLER_DECLARE(register_framebuffer, register_framebuffer_fn); EVENTHANDLER_DECLARE(unregister_framebuffer, unregister_framebuffer_fn); /* Veto ada attachment */ struct cam_path; struct ata_params; typedef void (*ada_probe_veto_fn)(void *, struct cam_path *, struct ata_params *, int *); EVENTHANDLER_DECLARE(ada_probe_veto, ada_probe_veto_fn); /* Swap device events */ struct swdevt; typedef void (*swapon_fn)(void *, struct swdevt *); typedef void (*swapoff_fn)(void *, struct swdevt *); EVENTHANDLER_DECLARE(swapon, swapon_fn); EVENTHANDLER_DECLARE(swapoff, swapoff_fn); /* newbus device events */ enum evhdev_detach { EVHDEV_DETACH_BEGIN, /* Before detach() is called */ EVHDEV_DETACH_COMPLETE, /* After detach() returns 0 */ EVHDEV_DETACH_FAILED /* After detach() returns err */ }; typedef void (*device_attach_fn)(void *, device_t); typedef void (*device_detach_fn)(void *, device_t, 
enum evhdev_detach); typedef void (*device_nomatch_fn)(void *, device_t); EVENTHANDLER_DECLARE(device_attach, device_attach_fn); EVENTHANDLER_DECLARE(device_detach, device_detach_fn); EVENTHANDLER_DECLARE(device_nomatch, device_nomatch_fn); /* Interface address addition and removal event */ struct ifaddr; typedef void (*rt_addrmsg_fn)(void *, struct ifaddr *, int); EVENTHANDLER_DECLARE(rt_addrmsg, rt_addrmsg_fn); +/* Kernel environment variable change event */ +typedef void (*env_change_fn)(void *, const char *); +EVENTHANDLER_DECLARE(setenv, env_change_fn); +EVENTHANDLER_DECLARE(unsetenv, env_change_fn); + #endif /* _SYS_EVENTHANDLER_H_ */
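/*
 * Usage sketch (editorial illustration, assumed consumer; not part of
 * this diff): reacting to the new kernel environment events declared
 * above.
 */
static void
foo_kenv_watch(void *arg __unused, const char *name)
{

	printf("kenv variable %s changed\n", name);
}
EVENTHANDLER_DEFINE(setenv, foo_kenv_watch, NULL, EVENTHANDLER_PRI_ANY);
EVENTHANDLER_DEFINE(unsetenv, foo_kenv_watch, NULL, EVENTHANDLER_PRI_ANY);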