diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 3185e77aba46..2d384b6b9a60 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -1,213 +1,213 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int physio(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *csw; struct buf *pbuf; struct bio *bp; struct vm_page **pages; char *base, *sa; u_int iolen, poff; int error, i, npages, maxpages; vm_prot_t prot; csw = dev->si_devsw; npages = 0; sa = NULL; /* check if character device is being destroyed */ if (csw == NULL) return (ENXIO); /* XXX: sanity check */ - if(dev->si_iosize_max < PAGE_SIZE) { + if (dev->si_iosize_max < PAGE_SIZE) { printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", devtoname(dev), dev->si_iosize_max); dev->si_iosize_max = DFLTPHYS; } /* * If the driver does not want I/O to be split, that means that we * need to reject any requests that will not fit into one buffer. */ if (dev->si_flags & SI_NOSPLIT && (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > maxphys || uio->uio_iovcnt > 1)) { /* * Tell the user why his I/O was rejected. */ if (uio->uio_resid > dev->si_iosize_max) uprintf("%s: request size=%zd > si_iosize_max=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, dev->si_iosize_max); if (uio->uio_resid > maxphys) uprintf("%s: request size=%zd > maxphys=%lu; " "cannot split request\n", devtoname(dev), uio->uio_resid, maxphys); if (uio->uio_iovcnt > 1) uprintf("%s: request vectors=%d > 1; " "cannot split request\n", devtoname(dev), uio->uio_iovcnt); return (EFBIG); } /* * Keep the process UPAGES from being swapped. Processes swapped * out while holding pbufs, used by swapper, may lead to deadlock. 
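 *
 * (Editorial aside, not part of the original file.)  Each pass of the
 * transfer loop below clamps the bio to
 *
 *	bp->bio_length = MIN(uio->uio_iov[i].iov_len,
 *	    MIN(dev->si_iosize_max, maxphys));
 *
 * so a request that passed the SI_NOSPLIT checks above is simply carved
 * into pieces: e.g. a 10 MB read against maxphys = 1 MB completes as
 * ten consecutive bios.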
*/ PHOLD(curproc); bp = g_alloc_bio(); if (uio->uio_segflg != UIO_USERSPACE) { pbuf = NULL; pages = NULL; } else if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { pbuf = NULL; maxpages = btoc(MIN(uio->uio_resid, maxphys)) + 1; pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK); } else { pbuf = uma_zalloc(pbuf_zone, M_WAITOK); MPASS((pbuf->b_flags & B_MAXPHYS) != 0); sa = pbuf->b_data; maxpages = PBUF_PAGES; pages = pbuf->b_pages; } prot = VM_PROT_READ; if (uio->uio_rw == UIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ error = 0; for (i = 0; i < uio->uio_iovcnt; i++) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); if (uio->uio_rw == UIO_READ) { racct_add_force(curproc, RACCT_READBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_READIOPS, 1); } else { racct_add_force(curproc, RACCT_WRITEBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } PROC_UNLOCK(curproc); } #endif /* RACCT */ while (uio->uio_iov[i].iov_len) { g_reset_bio(bp); if (uio->uio_rw == UIO_READ) { bp->bio_cmd = BIO_READ; curthread->td_ru.ru_inblock++; } else { bp->bio_cmd = BIO_WRITE; curthread->td_ru.ru_oublock++; } bp->bio_offset = uio->uio_offset; base = uio->uio_iov[i].iov_base; bp->bio_length = uio->uio_iov[i].iov_len; if (bp->bio_length > dev->si_iosize_max) bp->bio_length = dev->si_iosize_max; if (bp->bio_length > maxphys) bp->bio_length = maxphys; bp->bio_bcount = bp->bio_length; bp->bio_dev = dev; if (pages) { if ((npages = vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)base, bp->bio_length, prot, pages, maxpages)) < 0) { error = EFAULT; goto doerror; } poff = (vm_offset_t)base & PAGE_MASK; if (pbuf && sa) { pmap_qenter((vm_offset_t)sa, pages, npages); bp->bio_data = sa + poff; } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } } else bp->bio_data = base; csw->d_strategy(bp); if (uio->uio_rw == UIO_READ) biowait(bp, "physrd"); else biowait(bp, "physwr"); if (pages) { if (pbuf) pmap_qremove((vm_offset_t)sa, npages); vm_page_unhold_pages(pages, npages); } iolen = bp->bio_length - bp->bio_resid; if (iolen == 0 && !(bp->bio_flags & BIO_ERROR)) goto doerror; /* EOF */ uio->uio_iov[i].iov_len -= iolen; uio->uio_iov[i].iov_base = (char *)uio->uio_iov[i].iov_base + iolen; uio->uio_resid -= iolen; uio->uio_offset += iolen; if (bp->bio_flags & BIO_ERROR) { error = bp->bio_error; goto doerror; } } } doerror: if (pbuf) uma_zfree(pbuf_zone, pbuf); else if (pages) free(pages, M_DEVBUF); g_destroy_bio(bp); PRELE(curproc); return (error); } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index ffb6ac196ba3..011e3f44a124 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,2985 +1,2985 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_sysctl.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. * * The sysctlstringlock is used to protect concurrent access to writable * string nodes in sysctl_handle_string(). 
 */
static struct rmlock sysctllock;
static struct sx __exclusive_cache_line sysctlmemlock;
static struct sx sysctlstringlock;

#define	SYSCTL_WLOCK()		rm_wlock(&sysctllock)
#define	SYSCTL_WUNLOCK()	rm_wunlock(&sysctllock)
#define	SYSCTL_RLOCK(tracker)	rm_rlock(&sysctllock, (tracker))
#define	SYSCTL_RUNLOCK(tracker)	rm_runlock(&sysctllock, (tracker))
#define	SYSCTL_WLOCKED()	rm_wowned(&sysctllock)
#define	SYSCTL_ASSERT_LOCKED()	rm_assert(&sysctllock, RA_LOCKED)
#define	SYSCTL_ASSERT_WLOCKED()	rm_assert(&sysctllock, RA_WLOCKED)
#define	SYSCTL_ASSERT_RLOCKED()	rm_assert(&sysctllock, RA_RLOCKED)
#define	SYSCTL_INIT()		rm_init_flags(&sysctllock, "sysctl lock", \
				    RM_SLEEPABLE)
#define	SYSCTL_SLEEP(ch, wmesg, timo) \
				rm_sleep(ch, &sysctllock, 0, wmesg, timo)

static int sysctl_root(SYSCTL_HANDLER_ARGS);

/* Root list */
struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children);

static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
    int recurse);
static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t);
static int sysctl_new_kernel(struct sysctl_req *, void *, size_t);

static struct sysctl_oid *
sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
{
	struct sysctl_oid *oidp;

	SYSCTL_ASSERT_LOCKED();
	SLIST_FOREACH(oidp, list, oid_link) {
		if (strcmp(oidp->oid_name, name) == 0) {
			return (oidp);
		}
	}
	return (NULL);
}

/*
 * Initialization of the MIB tree.
 *
 * Order by number in each list.
 */
void
sysctl_wlock(void)
{

	SYSCTL_WLOCK();
}

void
sysctl_wunlock(void)
{

	SYSCTL_WUNLOCK();
}

static int
sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2,
    struct sysctl_req *req, struct rm_priotracker *tracker)
{
	int error;

	if (oid->oid_kind & CTLFLAG_DYN)
		atomic_add_int(&oid->oid_running, 1);
	if (tracker != NULL)
		SYSCTL_RUNLOCK(tracker);
	else
		SYSCTL_WUNLOCK();

	/*
	 * Treat set CTLFLAG_NEEDGIANT and unset CTLFLAG_MPSAFE flags the same,
	 * until we're ready to remove all traces of Giant from sysctl(9).
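	 *
	 * (Editor's illustration, not in the original file.)  Whether the
	 * Giant bracket below applies is decided at declaration time; a
	 * handler registered as
	 *
	 *	SYSCTL_PROC(_kern, OID_AUTO, demo,
	 *	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	 *	    NULL, 0, demo_handler, "I", "Giant-free example");
	 *
	 * runs without Giant ("demo"/"demo_handler" are hypothetical), while
	 * omitting CTLFLAG_MPSAFE, or setting CTLFLAG_NEEDGIANT, keeps the
	 * handler wrapped in mtx_lock(&Giant)/mtx_unlock(&Giant).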
*/ if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[96]; ssize_t rem = sizeof(path); ssize_t len; uint8_t data[512] __aligned(sizeof(uint64_t)); int size; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_UINT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_LONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_ULONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int8_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int16_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int32_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int64_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint8_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint16_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint32_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint64_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) 
printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME * elements and be NULL initialized. */ static int sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle) { int indx; SYSCTL_ASSERT_LOCKED(); indx = 0; while (indx < CTL_MAXNAME && indx >= 0) { if (nodes[indx] == NULL && indx == 0) nodes[indx] = SLIST_FIRST(&sysctl__children); else if (nodes[indx] == NULL) nodes[indx] = SLIST_FIRST(&nodes[indx - 1]->oid_children); else nodes[indx] = SLIST_NEXT(nodes[indx], oid_link); if (nodes[indx] == needle) return (indx + 1); if (nodes[indx] == NULL) { indx--; continue; } if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) { indx++; continue; } } return (-1); } static void sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf) { struct sysctl_oid *nodes[CTL_MAXNAME]; char buf[128]; struct sbuf sb; int rc, i; (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__); memset(nodes, 0, sizeof(nodes)); rc = sysctl_search_oid(nodes, leaf); if (rc > 0) { for (i = 0; i < rc; i++) sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name, i != (rc - 1), "."); } else { sbuf_printf(&sb, "%s", leaf->oid_name); } sbuf_printf(&sb, ")!\n"); (void)sbuf_finish(&sb); } #ifdef SYSCTL_DEBUG static int sysctl_reuse_test(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; SYSCTL_RLOCK(&tracker); sysctl_warn_reuse(__func__, oidp); SYSCTL_RUNLOCK(&tracker); return (0); } SYSCTL_PROC(_sysctl, OID_AUTO, reuse_test, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_reuse_test, "-", ""); #endif void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { sysctl_warn_reuse(__func__, p); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. 
*/ retry: q = NULL; SLIST_FOREACH(p, parent, oid_link) { /* check if the current OID number is in use */ if (oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); goto retry; } else { oid_number++; } } else if (oid_number < p->oid_number) break; q = p; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; if (q != NULL) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* only fetch value once */ oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. 
* e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { - if(e->entry == oidp) + if (e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; error = ENOENT; SYSCTL_WLOCK(); SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { if (strcmp(p->oid_name, name) == 0) { error = sysctl_remove_oid_locked(p, del, recurse); break; } } SYSCTL_WUNLOCK(); return (error); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. 
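 *
 * (Editor's sketch of the supported consumer pattern; names are
 * hypothetical.)  Let a context own every dynamic oid and tear them
 * down in one call:
 *
 *	static struct sysctl_ctx_list demo_ctx;
 *	static int demo_val;
 *
 *	sysctl_ctx_init(&demo_ctx);
 *	SYSCTL_ADD_INT(&demo_ctx, SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO,
 *	    "demo_val", CTLFLAG_RW, &demo_val, 0, "demo value");
 *	...
 *	error = sysctl_ctx_free(&demo_ctx);
 *
 * On failure the dry run above re-registers whatever it had already
 * detached and returns EBUSY, leaving the tree as it found it.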
*/ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(oidp), oid_link, tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); if (oidp->oid_label) free(__DECONST(char *, oidp->oid_label), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label) { struct sysctl_oid *oidp; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { sysctl_warn_reuse(__func__, oidp); SYSCTL_WUNLOCK(); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = strdup(name, M_SYSCTLOID); oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); if (label != NULL) oidp->oid_label = strdup(label, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. 
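 *
 * (Editor's illustration.)  Statically declared oids compiled into the
 * kernel are emitted into the "sysctl_set" linker set and registered by
 * the SET_FOREACH() loop below at boot, e.g.:
 *
 *	static int demo_value = 42;
 *	SYSCTL_INT(_debug, OID_AUTO, demo, CTLFLAG_RW,
 *	    &demo_value, 0, "example of a statically declared oid");
 *
 * ("demo"/"demo_value" are hypothetical names.)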
 */
SET_DECLARE(sysctl_set, struct sysctl_oid);

static void
sysctl_register_all(void *arg)
{
	struct sysctl_oid **oidp;

	sx_init(&sysctlmemlock, "sysctl mem");
	sx_init(&sysctlstringlock, "sysctl string handler");
	SYSCTL_INIT();
	SYSCTL_WLOCK();
	SET_FOREACH(oidp, sysctl_set)
		sysctl_register_oid(*oidp);
	SYSCTL_WUNLOCK();
}
SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL);

/*
 * "Staff-functions"
 *
 * These functions implement a presently undocumented interface
 * used by the sysctl program to walk the tree, and get the type
 * so it can print the value.
 * This interface is under work and consideration, and should probably
 * be killed with a big axe by the first person who can find the time.
 * (be aware though, that the proper interface isn't as obvious as it
 * may seem, there are various conflicting requirements.
 *
 * {CTL_SYSCTL, CTL_SYSCTL_DEBUG}		printf the entire MIB-tree.
 * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...}		return the name of the "..."
 *						OID.
 * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...}		return the next OID, honoring
 *						CTLFLAG_SKIP.
 * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID}		return the OID of the name in
 *						"new"
 * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...}		return the kind & format info
 *						for the "..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...}	return the description of the
 *						"..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...}	return the aggregation label of
 *						the "..." OID.
 * {CTL_SYSCTL, CTL_SYSCTL_NEXTNOSKIP, ...}	return the next OID, ignoring
 *						CTLFLAG_SKIP.
 */

#ifdef SYSCTL_DEBUG
static void
sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
{
	int k;
	struct sysctl_oid *oidp;

	SYSCTL_ASSERT_LOCKED();
	SLIST_FOREACH(oidp, l, oid_link) {
		for (k = 0; k < i; k++)
			printf(" ");

		printf("%d %s ", oidp->oid_number, oidp->oid_name);

		printf("%c%c",
		    oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
		    oidp->oid_kind & CTLFLAG_WR ?
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = SYSCTL_CHILDREN(oid); break; } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); enum sysctl_iter_action { ITER_SIBLINGS, /* Not matched, continue iterating siblings */ ITER_CHILDREN, /* Node has children we need to iterate over them */ ITER_FOUND, /* Matching node was found */ }; /* * Tries to find the next node for @name and @namelen. * * Returns next action to take. */ static enum sysctl_iter_action sysctl_sysctl_next_node(struct sysctl_oid *oidp, int *name, unsigned int namelen, bool honor_skip) { if ((oidp->oid_kind & CTLFLAG_DORMANT) != 0) return (ITER_SIBLINGS); if (honor_skip && (oidp->oid_kind & CTLFLAG_SKIP) != 0) return (ITER_SIBLINGS); if (namelen == 0) { /* * We have reached a node with a full name match and are * looking for the next oid in its children. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. 
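	 *
	 * (Editor's sketch.)  Userland relies on exactly this iterator to
	 * walk the tree, sysctl(8)-style; error handling elided, with
	 * 'oid'/'len' holding the previous position:
	 *
	 *	int qoid[CTL_MAXNAME + 2] = { CTL_SYSCTL, CTL_SYSCTL_NEXT };
	 *	int next[CTL_MAXNAME];
	 *	size_t nextlen = sizeof(next);
	 *
	 *	memcpy(qoid + 2, oid, len * sizeof(int));
	 *	if (sysctl(qoid, len + 2, next, &nextlen, NULL, 0) == 0)
	 *		handle(next, nextlen / sizeof(int));
	 *
	 * ENOENT from this query is the conventional end-of-tree marker.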
*/ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); /* Report oid as a node to iterate */ return (ITER_CHILDREN); } /* * No match yet. Continue seeking the given name. * * We are iterating in order by oid_number, so skip oids lower * than the one we are looking for. * * When the current oid_number is higher than the one we seek, * that means we have reached the next oid in the sequence and * should return it. * * If the oid_number matches the name at this level then we * have to find a node to continue searching at the next level. */ if (oidp->oid_number < *name) return (ITER_SIBLINGS); if (oidp->oid_number > *name) { /* * We have reached the next oid. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. */ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); return (ITER_CHILDREN); } /* match at a current level */ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_SIBLINGS); if (oidp->oid_handler) return (ITER_SIBLINGS); return (ITER_CHILDREN); } /* * Recursively walk the sysctl subtree at lsp until we find the given name. * Returns true and fills in next oid data in @next and @len if oid is found. */ static bool sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, bool honor_skip) { struct sysctl_oid *oidp; bool success = false; enum sysctl_iter_action action; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, lsp, oid_link) { action = sysctl_sysctl_next_node(oidp, name, namelen, honor_skip); if (action == ITER_SIBLINGS) continue; if (action == ITER_FOUND) { success = true; break; } KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action)); lsp = SYSCTL_CHILDREN(oidp); if (namelen == 0) { success = sysctl_sysctl_next_action(lsp, NULL, 0, next + 1, len, level + 1, honor_skip); } else { success = sysctl_sysctl_next_action(lsp, name + 1, namelen - 1, next + 1, len, level + 1, honor_skip); if (!success) { /* * We maintain the invariant that current node oid * is >= the oid provided in @name. * As there are no usable children at this node, * current node oid is strictly > than the requested * oid. * Hence, reduce namelen to 0 to allow for picking first * nodes/leafs in the next node in list. */ namelen = 0; } } if (success) break; } if (success) { *next = oidp->oid_number; if (level > *len) *len = level; } return (success); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int len, error; bool success; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int next[CTL_MAXNAME]; len = 0; SYSCTL_RLOCK(&tracker); success = sysctl_sysctl_next_action(lsp, name, namelen, next, &len, 1, oidp->oid_number == CTL_SYSCTL_NEXT); SYSCTL_RUNLOCK(&tracker); if (!success) return (ENOENT); error = SYSCTL_OUT(req, next, len * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. 
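 *
 * (Editor's note for context.)  Enforcement itself happens in
 * sysctl_root(): once a process has entered capability mode, only nodes
 * carrying CTLFLAG_CAPRD may be read and only CTLFLAG_CAPWR nodes
 * written, so e.g. (hypothetical node name):
 *
 *	cap_enter();
 *	error = sysctlbyname("kern.demo", buf, &len, NULL, 0);
 *		- EPERM unless kern.demo was declared CTLFLAG_CAPRD
 *
 * The XXX above is that the meta-nodes still reveal the names of oids
 * such a process could never query.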
*/ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXTNOSKIP, nextnoskip, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; char *p; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { p = strsep(&name, "."); oidp = SLIST_FIRST(lsp); for (;; oidp = SLIST_NEXT(oidp, oid_link)) { if (oidp == NULL) return (ENOENT); if (strcmp(p, oidp->oid_name) == 0) break; } *oid++ = oidp->oid_number; (*len)++; if (name == NULL || *name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_label == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, 
	    oid->oid_label, strlen(oid->oid_label) + 1);
 out:
	SYSCTL_RUNLOCK(&tracker);
	return (error);
}

static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD |
    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, "");

/*
 * Default "handler" functions.
 */

/*
 * Handle a bool.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_bool(SYSCTL_HANDLER_ARGS)
{
	uint8_t temp;
	int error;

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
	 */
	if (arg1)
		temp = *(bool *)arg1 ? 1 : 0;
	else
		temp = arg2 ? 1 : 0;

	error = SYSCTL_OUT(req, &temp, sizeof(temp));
	if (error || !req->newptr)
		return (error);

	if (!arg1)
		error = EPERM;
	else {
		error = SYSCTL_IN(req, &temp, sizeof(temp));
		if (!error)
			*(bool *)arg1 = temp ? 1 : 0;
	}
	return (error);
}

/*
 * Handle an int8_t, signed or unsigned.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_8(SYSCTL_HANDLER_ARGS)
{
	int8_t tmpout;
	int error = 0;

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
	 */
	if (arg1)
		tmpout = *(int8_t *)arg1;
	else
		tmpout = arg2;
	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));

	if (error || !req->newptr)
		return (error);

	if (!arg1)
		error = EPERM;
	else
		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
	return (error);
}

/*
 * Handle an int16_t, signed or unsigned.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_16(SYSCTL_HANDLER_ARGS)
{
	int16_t tmpout;
	int error = 0;

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
	 */
	if (arg1)
		tmpout = *(int16_t *)arg1;
	else
		tmpout = arg2;
	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));

	if (error || !req->newptr)
		return (error);

	if (!arg1)
		error = EPERM;
	else
		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
	return (error);
}

/*
 * Handle an int32_t, signed or unsigned.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_32(SYSCTL_HANDLER_ARGS)
{
	int32_t tmpout;
	int error = 0;

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
	 */
	if (arg1)
		tmpout = *(int32_t *)arg1;
	else
		tmpout = arg2;
	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));

	if (error || !req->newptr)
		return (error);

	if (!arg1)
		error = EPERM;
	else
		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
	return (error);
}

/*
 * Handle an int, signed or unsigned.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_int(SYSCTL_HANDLER_ARGS)
{
	int tmpout, error = 0;

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
	 */
	if (arg1)
		tmpout = *(int *)arg1;
	else
		tmpout = arg2;
	error = SYSCTL_OUT(req, &tmpout, sizeof(int));

	if (error || !req->newptr)
		return (error);

	if (!arg1)
		error = EPERM;
	else
		error = SYSCTL_IN(req, arg1, sizeof(int));
	return (error);
}

/*
 * Based on sysctl_handle_int() convert milliseconds into ticks.
 * Note: this is used by TCP.
 */
int
sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
{
	int error, s, tt;

	tt = *(int *)arg1;
	s = (int)((int64_t)tt * 1000 / hz);

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = (int)((int64_t)s * hz / 1000);
	if (tt < 1)
		return (EINVAL);

	*(int *)arg1 = tt;
	return (0);
}

/*
 * Handle a long, signed or unsigned.
 * Two cases:
 *     a variable:  point arg1 at it.
 *     a constant:  pass it in arg2.
 */
int
sysctl_handle_long(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	long tmplong;
#ifdef SCTL_MASK32
	int tmpint;
#endif

	/*
	 * Attempt to get a coherent snapshot by making a copy of the data.
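	 *
	 * (Editor's aside.)  The snapshot-then-copyin shape above is also
	 * the base for unit-converting handlers; sysctl_msec_to_ticks()
	 * earlier is one instance, and a driver-private variant might look
	 * like this, with hypothetical conversion macros:
	 *
	 *	static int
	 *	demo_msec_sysctl(SYSCTL_HANDLER_ARGS)
	 *	{
	 *		int error, ms;
	 *
	 *		ms = DEMO_TICKS_TO_MSEC(*(int *)arg1);
	 *		error = sysctl_handle_int(oidp, &ms, 0, req);
	 *		if (error || req->newptr == NULL)
	 *			return (error);
	 *		*(int *)arg1 = DEMO_MSEC_TO_TICKS(ms);
	 *		return (0);
	 *	}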
*/ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { char *tmparg; size_t outlen; int error = 0, ro_string = 0; /* * If the sysctl isn't writable and isn't a preallocated tunable that * can be modified by kenv(2), microoptimise and treat it as a * read-only string. * A zero-length buffer indicates a fixed size read-only * string. In ddb, don't worry about trying to make a malloced * snapshot. */ if ((oidp->oid_kind & (CTLFLAG_WR | CTLFLAG_TUN)) == 0 || arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { if (ro_string) { tmparg = arg1; outlen = strlen(tmparg) + 1; } else { tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); sx_slock(&sysctlstringlock); memcpy(tmparg, arg1, arg2); sx_sunlock(&sysctlstringlock); outlen = strlen(tmparg) + 1; } error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { if (!ro_string) sx_slock(&sysctlstringlock); outlen = strlen((char *)arg1) + 1; if (!ro_string) sx_sunlock(&sysctlstringlock); error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if (req->newlen - req->newidx >= arg2 || req->newlen - req->newidx < 0) { error = EINVAL; } else if (req->newlen - req->newidx == 0) { sx_xlock(&sysctlstringlock); ((char *)arg1)[0] = '\0'; sx_xunlock(&sysctlstringlock); } else { arg2 = req->newlen - req->newidx; tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); error = SYSCTL_IN(req, tmparg, arg2); if (error) { free(tmparg, M_SYSCTLTMP); return (error); } sx_xlock(&sysctlstringlock); memcpy(arg1, tmparg, arg2); ((char *)arg1)[arg2] = '\0'; sx_xunlock(&sysctlstringlock); free(tmparg, M_SYSCTLTMP); req->newidx += arg2; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. 
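	 *
	 * (Editor's sketch.)  Consumers typically reach this handler
	 * through SYSCTL_OPAQUE(), exporting a whole struct by address and
	 * size:
	 *
	 *	static struct demo_stats stats;	- hypothetical struct
	 *	SYSCTL_OPAQUE(_debug, OID_AUTO, demo_stats, CTLFLAG_RD,
	 *	    &stats, sizeof(stats), "S,demo_stats", "demo statistics");
	 *
	 * The generation check below is best-effort only: it retries when
	 * the copying thread was preempted, it does not lock writers out.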
	 */
	tries = 0;
	req2 = *req;
retry:
	generation = curthread->td_generation;
	error = SYSCTL_OUT(req, arg1, arg2);
	if (error)
		return (error);
	tries++;
	if (generation != curthread->td_generation && tries < 3) {
		*req = req2;
		goto retry;
	}

	error = SYSCTL_IN(req, arg1, arg2);

	return (error);
}

/*
 * Based on sysctl_handle_int() convert microseconds to a sbintime.
 */
int
sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS)
{
	int error;
	int64_t tt;
	sbintime_t sb;

	tt = *(int64_t *)arg1;
	sb = sbttous(tt);

	error = sysctl_handle_64(oidp, &sb, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = ustosbt(sb);
	*(int64_t *)arg1 = tt;

	return (0);
}

/*
 * Based on sysctl_handle_int() convert milliseconds to a sbintime.
 */
int
sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS)
{
	int error;
	int64_t tt;
	sbintime_t sb;

	tt = *(int64_t *)arg1;
	sb = sbttoms(tt);

	error = sysctl_handle_64(oidp, &sb, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = mstosbt(sb);
	*(int64_t *)arg1 = tt;

	return (0);
}

/*
 * Convert seconds to a struct timeval.  Intended for use with
 * intervals and thus does not permit negative seconds.
 */
int
sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS)
{
	struct timeval *tv;
	int error, secs;

	tv = arg1;
	secs = tv->tv_sec;

	error = sysctl_handle_int(oidp, &secs, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (secs < 0)
		return (EINVAL);
	tv->tv_sec = secs;

	return (0);
}

/*
 * Transfer functions to/from kernel space.
 * XXX: rather untested at this point
 */
static int
sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
{
	size_t i = 0;

	if (req->oldptr) {
		i = l;
		if (req->oldlen <= req->oldidx)
			i = 0;
		else if (i > req->oldlen - req->oldidx)
			i = req->oldlen - req->oldidx;
		if (i > 0)
			bcopy(p, (char *)req->oldptr + req->oldidx, i);
	}
	req->oldidx += l;
	if (req->oldptr && i != l)
		return (ENOMEM);
	return (0);
}

static int
sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
{
	if (!req->newptr)
		return (0);
	if (req->newlen - req->newidx < l)
		return (EINVAL);
	bcopy((const char *)req->newptr + req->newidx, p, l);
	req->newidx += l;
	return (0);
}

int
kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
    size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
{
	int error = 0;
	struct sysctl_req req;

	bzero(&req, sizeof req);

	req.td = td;
	req.flags = flags;

	if (oldlenp) {
		req.oldlen = *oldlenp;
	}
	req.validlen = req.oldlen;

	if (old) {
		req.oldptr = old;
	}

	if (new != NULL) {
		req.newlen = newlen;
		req.newptr = new;
	}

	req.oldfunc = sysctl_old_kernel;
	req.newfunc = sysctl_new_kernel;
	req.lock = REQ_UNWIRED;

	error = sysctl_root(0, name, namelen, &req);

	if (req.lock == REQ_WIRED && req.validlen > 0)
		vsunlock(req.oldptr, req.validlen);

	if (error && error != ENOMEM)
		return (error);

	if (retval) {
		if (req.oldptr && req.oldidx > req.validlen)
			*retval = req.validlen;
		else
			*retval = req.oldidx;
	}
	return (error);
}

int
kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
    void *new, size_t newlen, size_t *retval, int flags)
{
	int oid[CTL_MAXNAME];
	size_t oidlen, plen;
	int error;

	oid[0] = CTL_SYSCTL;
	oid[1] = CTL_SYSCTL_NAME2OID;
	oidlen = sizeof(oid);

	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
	    (void *)name, strlen(name), &plen, flags);
	if (error)
		return (error);

	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
	    new, newlen, retval, flags);
	return (error);
}

/*
 * Transfer function to/from user space.
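 *
 * (Editor's illustration.)  The kernel-space pair above is reached via
 * kernel_sysctl()/kernel_sysctlbyname(), e.g. from kernel code:
 *
 *	char buf[MAXHOSTNAMELEN];
 *	char node[] = "kern.hostname";
 *	size_t len = sizeof(buf);
 *	int error;
 *
 *	error = kernel_sysctlbyname(curthread, node, buf, &len,
 *	    NULL, 0, NULL, 0);
 *
 * while the user-space pair that follows is what sysctl(2) installs.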
*/ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number == name[indx]) break; } if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? 
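	 *
	 * (Editor's aside on sysctl_wire_old_buffer() above: a handler
	 * that must not fault while holding a lock wires the destination
	 * first, the classic shape being
	 *
	 *	error = sysctl_wire_old_buffer(req, 0);
	 *	if (error != 0)
	 *		return (error);
	 *	mtx_lock(&demo_lock);		- hypothetical lock/data
	 *	error = SYSCTL_OUT(req, &demo_data, sizeof(demo_data));
	 *	mtx_unlock(&demo_lock);
	 *	return (error);
	 *
	 * since SYSCTL_OUT then takes the copyout_nofault() path on the
	 * wired pages.)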
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } int kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel) { int oid[CTL_MAXNAME]; char namebuf[16]; char *name; size_t oidlen; int error; if (namelen > MAXPATHLEN || namelen == 0) return (EINVAL); name = namebuf; if (namelen > sizeof(namebuf)) name = malloc(namelen, M_SYSCTL, M_WAITOK); error = copyin(oname, name, namelen); if (error != 0) goto out; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, retval, flags); if (error != 0) goto out; error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, inkernel, new, newlen, retval, flags); out: if (namelen > sizeof(namebuf)) free(name, M_SYSCTL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctlbyname_args { const char *name; size_t namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) { size_t rv; int error; error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); if (error != 
0) return (error); if (uap->oldlenp != NULL) error = copyout(&rv, uap->oldlenp, sizeof(rv)); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif memlocked = 0; if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. 
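	 *
	 * (Editor's sketch of the intended use; names hypothetical.)  A
	 * handler built on this helper drains formatted output straight
	 * into the request instead of sizing a buffer up front:
	 *
	 *	static int
	 *	demo_sbuf_sysctl(SYSCTL_HANDLER_ARGS)
	 *	{
	 *		struct sbuf sb;
	 *		int error;
	 *
	 *		sbuf_new_for_sysctl(&sb, NULL, 128, req);
	 *		sbuf_printf(&sb, "line %d\n", 42);
	 *		error = sbuf_finish(&sb);
	 *		sbuf_delete(&sb);
	 *		return (error);
	 *	}
	 *
	 * Per the drain comment above, the caller should wire req first
	 * when a page fault mid-drain would be a problem.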
*/ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } #ifdef DDB /* The current OID the debugger is working with */ static struct sysctl_oid *g_ddb_oid; /* The current flags specified by the user */ static int g_ddb_sysctl_flags; /* Check to see if the last sysctl printed */ static int g_ddb_sysctl_printed; static const int ctl_sign[CTLTYPE+1] = { [CTLTYPE_INT] = 1, [CTLTYPE_LONG] = 1, [CTLTYPE_S8] = 1, [CTLTYPE_S16] = 1, [CTLTYPE_S32] = 1, [CTLTYPE_S64] = 1, }; static const int ctl_size[CTLTYPE+1] = { [CTLTYPE_INT] = sizeof(int), [CTLTYPE_UINT] = sizeof(u_int), [CTLTYPE_LONG] = sizeof(long), [CTLTYPE_ULONG] = sizeof(u_long), [CTLTYPE_S8] = sizeof(int8_t), [CTLTYPE_S16] = sizeof(int16_t), [CTLTYPE_S32] = sizeof(int32_t), [CTLTYPE_S64] = sizeof(int64_t), [CTLTYPE_U8] = sizeof(uint8_t), [CTLTYPE_U16] = sizeof(uint16_t), [CTLTYPE_U32] = sizeof(uint32_t), [CTLTYPE_U64] = sizeof(uint64_t), }; #define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ #define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ #define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ #define DB_SYSCTL_HEX 0x008 /* Compare with -x */ #define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ static const char db_sysctl_modifs[] = { 'N', 'n', 'o', 'x', }; static const int db_sysctl_modif_values[] = { DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, }; /* Handlers considered safe to print while recursing */ static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { sysctl_handle_bool, sysctl_handle_8, sysctl_handle_16, sysctl_handle_32, sysctl_handle_64, sysctl_handle_int, sysctl_handle_long, sysctl_handle_string, sysctl_handle_opaque, }; /* * Use in place of sysctl_old_kernel to print sysctl values. 
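* Values are rendered with db_printf() according to the OID's type and the requested modifier flags; the g_ddb_* globals above carry that context into the output routine.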
* * Compare to the output handling in show_var from sbin/sysctl/sysctl.c */ static int sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) { const u_char *val, *p; const char *sep1; size_t intlen, slen; uintmax_t umv; intmax_t mv; int sign, ctltype, hexlen, xflag, error; /* Suppress false-positive GCC uninitialized variable warnings */ mv = 0; umv = 0; slen = len; val = p = ptr; if (ptr == NULL) { error = 0; goto out; } /* We are going to print */ g_ddb_sysctl_printed = 1; xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; ctltype = (g_ddb_oid->oid_kind & CTLTYPE); sign = ctl_sign[ctltype]; intlen = ctl_size[ctltype]; switch (ctltype) { case CTLTYPE_NODE: case CTLTYPE_STRING: db_printf("%.*s", (int) len, (const char *) p); error = 0; goto out; case CTLTYPE_INT: case CTLTYPE_UINT: case CTLTYPE_LONG: case CTLTYPE_ULONG: case CTLTYPE_S8: case CTLTYPE_S16: case CTLTYPE_S32: case CTLTYPE_S64: case CTLTYPE_U8: case CTLTYPE_U16: case CTLTYPE_U32: case CTLTYPE_U64: hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; sep1 = ""; while (len >= intlen) { switch (ctltype) { case CTLTYPE_INT: case CTLTYPE_UINT: umv = *(const u_int *)p; mv = *(const int *)p; break; case CTLTYPE_LONG: case CTLTYPE_ULONG: umv = *(const u_long *)p; mv = *(const long *)p; break; case CTLTYPE_S8: case CTLTYPE_U8: umv = *(const uint8_t *)p; mv = *(const int8_t *)p; break; case CTLTYPE_S16: case CTLTYPE_U16: umv = *(const uint16_t *)p; mv = *(const int16_t *)p; break; case CTLTYPE_S32: case CTLTYPE_U32: umv = *(const uint32_t *)p; mv = *(const int32_t *)p; break; case CTLTYPE_S64: case CTLTYPE_U64: umv = *(const uint64_t *)p; mv = *(const int64_t *)p; break; } db_printf("%s", sep1); if (xflag) db_printf("%#0*jx", hexlen, umv); else if (!sign) db_printf("%ju", umv); else if (g_ddb_oid->oid_fmt[1] == 'K') { /* Kelvins are currently unsupported. */ error = EOPNOTSUPP; goto out; } else db_printf("%jd", mv); sep1 = " "; len -= intlen; p += intlen; } error = 0; goto out; case CTLTYPE_OPAQUE: /* TODO: Support struct functions. */ /* FALLTHROUGH */ default: db_printf("Format:%s Length:%zu Dump:0x", g_ddb_oid->oid_fmt, len); while (len-- && (xflag || p < val + 16)) db_printf("%02x", *p++); if (!xflag && len > 16) db_printf("..."); error = 0; goto out; } out: req->oldidx += slen; return (error); } /* * Avoid setting new sysctl values from the debugger */ static int sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); /* Changing sysctls from the debugger is currently unsupported */ return (EPERM); } /* * Run a sysctl handler with the DDB oldfunc and newfunc attached. * Instead of copying any output to a buffer we'll dump it right to * the console. 
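* The request is faked up just enough for sysctl_root(): kdb_thread stands in for the current thread, and the DDB old/new functions above replace the userland copy routines.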
*/ static int db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, void *old, size_t *oldlenp, size_t *retval, int flags) { struct sysctl_req req; int error; /* Setup the request */ bzero(&req, sizeof req); req.td = kdb_thread; req.oldfunc = sysctl_old_ddb; req.newfunc = sysctl_new_ddb; req.lock = REQ_UNWIRED; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr = old; } /* Setup our globals for sysctl_old_ddb */ g_ddb_oid = oidp; g_ddb_sysctl_flags = flags; g_ddb_sysctl_printed = 0; error = sysctl_root(0, name, namelen, &req); /* Reset globals */ g_ddb_oid = NULL; g_ddb_sysctl_flags = 0; if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Show a sysctl's name */ static void db_show_oid_name(int *oid, size_t nlen) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME+2]; int error; qoid[0] = 0; memcpy(qoid + 2, oid, nlen * sizeof(int)); qoid[1] = 1; error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); if (error) db_error("sysctl name oid"); error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); if (error) db_error("sysctl name"); } /* * Check to see if an OID is safe to print from ddb. */ static bool db_oid_safe(const struct sysctl_oid *oidp) { for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { if (oidp->oid_handler == db_safe_handlers[i]) return (true); } return (false); } /* * Show a sysctl at a specific OID * Compare to the input handling in show_var from sbin/sysctl/sysctl.c */ static int db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) { int error, xflag, oflag, Nflag, nflag; size_t len; xflag = flags & DB_SYSCTL_HEX; oflag = flags & DB_SYSCTL_OPAQUE; nflag = flags & DB_SYSCTL_VALUE_ONLY; Nflag = flags & DB_SYSCTL_NAME_ONLY; if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && (!xflag && !oflag)) return (0); if (Nflag) { db_show_oid_name(oid, nlen); error = 0; goto out; } if (!nflag) { db_show_oid_name(oid, nlen); db_printf(": "); } if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { db_printf("Skipping, unsafe to print while recursing."); error = 0; goto out; } /* Try once, and ask about the size */ len = 0; error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags); if (error) goto out; if (!g_ddb_sysctl_printed) /* Lie about the size */ error = db_sysctl(oidp, oid, nlen, (void *) 1, &len, NULL, flags); out: db_printf("\n"); return (error); } /* * Show all sysctls under a specific OID * Compare to sysctl_all from sbin/sysctl/sysctl.c */ static int db_show_sysctl_all(int *oid, size_t len, int flags) { struct sysctl_oid *oidp; int name1[CTL_MAXNAME + 2], name2[CTL_MAXNAME + 2]; size_t l1, l2; name1[0] = CTL_SYSCTL; name1[1] = CTL_SYSCTL_NEXT; l1 = 2; if (len) { memcpy(name1 + 2, oid, len * sizeof(int)); l1 += len; } else { name1[2] = CTL_KERN; l1++; } for (;;) { int i, error; l2 = sizeof(name2); error = kernel_sysctl(kdb_thread, name1, l1, name2, &l2, NULL, 0, &l2, 0); if (error != 0) { if (error == ENOENT) return (0); else db_error("sysctl(next)"); } l2 /= sizeof(int); if (l2 < (unsigned int)len) return (0); for (i = 0; i < len; i++) if (name2[i] != oid[i]) return (0); /* Find the OID in question */ error = sysctl_find_oid(name2, l2, &oidp, NULL, NULL); if (error) return (error); i = db_show_oid(oidp, name2, l2, flags | DB_SYSCTL_SAFE_ONLY); if (db_pager_quit) return (0); memcpy(name1+2, name2, l2 * sizeof(int)); l1 = 2 + l2; } } /* * Show a sysctl by its user facing string */ static int db_sysctlbyname(char 
*name, int flags) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; error = name2oid(name, oid, &nlen, &oidp); if (error) { return (error); } if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { db_show_sysctl_all(oid, nlen, flags); } else { error = db_show_oid(oidp, oid, nlen, flags); } return (error); } static void db_sysctl_cmd_usage(void) { db_printf( " sysctl [/Nnox] <sysctl> \n" " \n" " <sysctl> The name of the sysctl to show. \n" " \n" " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" " This will work for most sysctls, but should not be used \n" " with sysctls that are known to malloc. \n" " \n" " While recursing any \"unsafe\" sysctls will be skipped. \n" " Call sysctl directly on the sysctl to try printing the \n" " skipped sysctl. This is unsafe and may make the ddb \n" " session unusable. \n" " \n" " Arguments: \n" " /N Display only the name of the sysctl. \n" " /n Display only the value of the sysctl. \n" " /o Display opaque values. \n" " /x Display the sysctl in hex. \n" " \n" "For example: \n" "sysctl vm.v_free_min \n" "vm.v_free_min: 12669 \n" ); } /* * Show a specific sysctl similar to sysctl(8). */ DB_FUNC(sysctl, db_sysctl_cmd, db_cmd_table, CS_OWN, NULL) { char name[TOK_STRING_SIZE]; int error, i, t, flags; /* Parse the modifiers */ t = db_read_token(); if (t == tSLASH || t == tMINUS) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); error = EINVAL; goto out; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } flags = 0; for (i = 0; i < nitems(db_sysctl_modifs); i++) { if (strchr(modif, db_sysctl_modifs[i])) { flags |= db_sysctl_modif_values[i]; } } /* Parse the sysctl names */ t = db_read_token(); if (t != tIDENT) { db_printf("Need sysctl name\n"); error = EINVAL; goto out; } /* Copy the name into a temporary buffer */ db_strcpy(name, db_tok_string); /* Ensure there is no trailing cruft */ t = db_read_token(); if (t != tEOL) { db_printf("Unexpected sysctl argument\n"); error = EINVAL; goto out; } error = db_sysctlbyname(name, flags); if (error == ENOENT) { db_printf("unknown oid: '%s'\n", db_tok_string); goto out; } else if (error) { db_printf("%s: error: %d\n", db_tok_string, error); goto out; } out: /* Ensure we eat all of our text */ db_flush_lex(); if (error == EINVAL) { db_sysctl_cmd_usage(); } } #endif /* DDB */ diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 7e328454c877..7ca67c390b91 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1,1088 +1,1088 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); static uma_zone_t cluster_pbuf_zone; static void cluster_init(void *); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct vn_clusterw *vnc, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 64; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL); static void cluster_init(void *dummy) { cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2); } /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; struct thread *td; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; td = curthread; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ error = getblkx(vp, lblkno, lblkno, size, 0, 0, gbflags, &bp); if (error != 0) { *bpp = NULL; return (error); } gbflags &= ~GB_NOSPARSE; origblkno = lblkno; *bpp = reqbp = bp; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) 
*/ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. */ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, rbp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. 
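* The caller sees *bpp == NULL in that case.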
*/ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct buf *bp, *tbp; daddr_t bn; off_t off; long tinc, tsize; int i, inc, j, k, toff; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != f_iosize %jd\n", size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; - if( (tbp->b_flags & B_MALLOC) || + if ( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT); if (bp == NULL) return tbp; MPASS((bp->b_flags & B_MAXPHYS) != 0); /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; } else { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i == 0) { vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); vfs_busy_pages_acquire(tbp); } else { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. The bo lock is not * required for the BKGRDINPROG check since it * can not be set without the buf lock. */ if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. 
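* Each page in the range is sbusied and checked below; finding an already-valid page unwinds via clean_sbusy and ends the cluster scan.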
*/ off = tbp->b_offset; tsize = size; for (j = 0; tsize > 0; j++) { toff = off & PAGE_MASK; tinc = tsize; if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; if (vm_page_trysbusy(tbp->b_pages[j]) == 0) break; if ((tbp->b_pages[j]->valid & vm_page_bits(toff, tinc)) != 0) { vm_page_sunbusy(tbp->b_pages[j]); break; } vm_object_pip_add(tbp->b_bufobj->bo_object, 1); off += tinc; tsize -= tinc; } if (tsize > 0) { clean_sbusy: vm_object_pip_wakeupn(tbp->b_bufobj->bo_object, j); for (k = 0; k < j; k++) vm_page_sunbusy(tbp->b_pages[k]); bqrelse(tbp); break; } /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. */ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { goto clean_sbusy; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if (vm_page_all_valid(m)) tbp->b_pages[j] = bogus_page; } /* * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. */ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ for (j = 0; j < bp->b_npages; j++) { if (vm_page_all_valid(bp->b_pages[j])) bp->b_pages[j] = bogus_page; } if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ static void cluster_callback(struct buf *bp) { struct buf *nbp, *tbp; int error = 0; /* * Must propagate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; if (buf_mapped(bp)) { pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. 
*/ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } pbrelvp(bp); uma_zfree(cluster_pbuf_zone, bp); } /* * cluster_wbuild_wb: * * Implement the write-behind policy for cluster writes. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { int r = 0; switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Four cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct vnode *vp, struct vn_clusterw *vnc, struct buf *bp, u_quad_t filesize, int seqcount, int gbflags) { daddr_t lbn; int maxclen, cursize; int lblocksize; int async; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vnc->v_lasta = vnc->v_clen = vnc->v_cstart = vnc->v_lastw = 0; if (vnc->v_clen == 0 || lbn != vnc->v_lastw + 1 || (bp->b_blkno != vnc->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vnc->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vnc->v_lastw - vnc->v_cstart + 1; if ((u_quad_t)bp->b_offset + lblocksize != filesize || lbn != vnc->v_lastw + 1 || vnc->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vnc->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, vnc, bp, gbflags); if (buflist == NULL) { /* * Cluster build failed so just write * it now. */ bawrite(bp); return; } endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vnc->v_cstart, cursize, gbflags); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vnc->v_lastw = lbn; vnc->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. 
If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. */ if (vp->v_type == VREG && (u_quad_t) bp->b_offset + lblocksize != filesize && bp->b_blkno == bp->b_lblkno && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) != 0 || bp->b_blkno == -1)) { bawrite(bp); vnc->v_clen = 0; vnc->v_lasta = bp->b_blkno; vnc->v_cstart = lbn + 1; vnc->v_lastw = lbn; return; } vnc->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vnc->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vnc->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vnc->v_cstart + vnc->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vnc->v_cstart, vnc->v_clen + 1, gbflags); } vnc->v_clen = 0; vnc->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vnc->v_lastw = lbn; vnc->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; int i, j; int totalwritten = 0; int dbsize = btodb(size); if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; bo = &vp->v_bufobj; while (len > 0) { /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ BO_LOCK(bo); if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); ++start_lbn; --len; continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { ++start_lbn; --len; continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; /* * Extra memory in the buffer, punt on this buffer. * XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } MPASS((bp->b_flags & B_MAXPHYS) != 0); /* * We got a pbuf to make the cluster in, * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. 
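* For an unmapped cluster (GB_UNMAPPED) b_data stays at unmapped_buf instead.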
*/ if ((gbflags & GB_UNMAPPED) == 0 || (tbp->b_flags & B_VMIO) == 0) { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } else { bp->b_data = unmapped_buf; } bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ /* * If the adjacent data is not even in core it * can't need to be written. */ BO_LOCK(bo); if ((tbp = gbincore(bo, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) break; if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; } /* end of code for non-first buffers only */ /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. */ if (tbp->b_flags & B_VMIO) { vm_page_t m; if (i == 0) { vfs_busy_pages_acquire(tbp); } else { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (vm_page_trysbusy(m) == 0) { for (j--; j >= 0; j--) vm_page_sunbusy( tbp->b_pages[j]); bqrelse(tbp); goto finishcluster; } } } vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } } bp->b_bcount += size; bp->b_bufsize += size; /* * If any of the clustered buffers have their * B_BARRIER flag set, transfer that request to * the cluster. */ bp->b_flags |= (tbp->b_flags & B_BARRIER); tbp->b_flags &= ~(B_DONE | B_BARRIER); tbp->b_flags |= B_ASYNC; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_iocmd = BIO_WRITE; bundirty(tbp); reassignbuf(tbp); /* put on clean list */ bufobj_wref(tbp->b_bufobj); BUF_KERNPROC(tbp); buf_track(tbp, __func__); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. 
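* The children are blocks v_cstart through v_lastw, read with bread_gb(), plus last_bp appended at the end; each is mapped to its physical block with VOP_BMAP() so the list can be handed to VOP_REALLOCBLKS().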
*/ static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct vn_clusterw *vnc, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, j, len, error; len = vnc->v_lastw - vnc->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vnc->v_cstart, i = 0; i < len; lbn++, i++) { error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, gbflags, &bp); if (error != 0) { /* * If read fails, release collected buffers * and return failure. */ for (j = 0; j < i; j++) brelse(buflist->bs_children[j]); free(buflist, M_SEGMENT); return (NULL); } buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } void cluster_init_vn(struct vn_clusterw *vnc) { vnc->v_lasta = 0; vnc->v_clen = 0; vnc->v_cstart = 0; vnc->v_lastw = 0; } diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 3b968fd19bbd..65d52cc68bcd 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -1,1160 +1,1159 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); static TAILQ_HEAD(, root_hold_token) root_holds = TAILQ_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; enum rh_flags { RH_FREE, RH_ALLOC, RH_ARG, }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. 
*/ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); static int root_mount_always_wait = 0; SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN, &root_mount_always_wait, 0, "Wait for root mount holds even if the root device already exists"); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); TAILQ_FOREACH(h, &root_holds, list) { if (h != TAILQ_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->flags = RH_ALLOC; h->who = identifier; mtx_lock(&root_holds_mtx); TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_hold_token(const char *identifier, struct root_hold_token *h) { #ifdef INVARIANTS struct root_hold_token *t; #endif h->flags = RH_ARG; h->who = identifier; mtx_lock(&root_holds_mtx); #ifdef INVARIANTS TAILQ_FOREACH(t, &root_holds, list) { if (t == h) { panic("Duplicate mount hold by '%s' on %p", identifier, h); } } #endif TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL || h->flags == RH_FREE) return; mtx_lock(&root_holds_mtx); TAILQ_REMOVE(&root_holds, h, list); TSRELEASE("root mount"); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); if (h->flags == RH_ALLOC) { free(h, M_DEVBUF); } else h->flags = RH_FREE; } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("set_rootvnode: Cannot find root vnode"); VOP_UNLOCK(rootvnode); pwd_set_rootvnode(); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. 
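* Busy the existing devfs instance and hand it back instead of mounting a second copy.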
*/ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); error = VFS_STATFS(mp, &mp->mnt_stat); KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; vfs_op_exit(mp); } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mporoot); if (mporoot != mpdevfs) cache_purgevfs(mpdevfs); if (VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot)) panic("vfs_mountroot_shuffle: Cannot find root vnode"); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; vn_irflag_unset_locked(vporoot, VIRF_MOUNTPOINT); vporoot->v_mountedhere = NULL; VI_UNLOCK(vporoot); mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); cache_purgevfs(rootvnode->v_mount); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); mporoot->mnt_vnodecovered = vp; vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VOP_UNLOCK(vp); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 
0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); VI_LOCK(vpdevfs); vn_irflag_unset_locked(vpdevfs, VIRF_MOUNTPOINT); vpdevfs->v_mountedhere = NULL; VI_UNLOCK(vpdevfs); vrele(vpdevfs); } VI_LOCK(vp); mpdevfs->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mpdevfs; VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_funlinkat(td, AT_FDCWD, "/dev/dev", FD_NONE, UIO_SYSSPACE, 0, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" <fstype>:<device> [options]\n"); printf(" Mount <device> using filesystem <fstype>\n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:zroot/ROOT/default\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" <empty line> Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' 
&& name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; (void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); error = kern_close(td, fd); out: free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 
0 : 1; } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int delay, error, timeout; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; delay = hz / 10; timeout = root_mount_timeout * hz; for (;;) { ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); if (error == 0 || timeout <= 0) break; if (root_mount_timeout * hz == timeout || (bootverbose && timeout % hz == 0)) { printf("Mounting from %s:%s failed with error %d; " "retrying for %d more second%s\n", fs, dev, error, timeout / hz, (timeout / hz > 1) ? "s" : ""); } pause("rmretry", delay); timeout -= delay; } out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. 
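* Dispose of the failure as directed by the most recent .onfail directive.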
*/ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; sbuf_printf(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) sbuf_printf(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); sbuf_printf(sb, ".timeout 0\n"); sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; TSENTER(); curfail = 0; while (1) { g_waitidle(); mtx_lock(&root_holds_mtx); if (TAILQ_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); TAILQ_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } TSWAIT("root mount"); msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); TSUNWAIT("root mount"); } TSEXIT(); } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * a specific device. Also do the wait if the user forced that * behaviour by setting vfs.root_mount_always_wait=1. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0' || root_mount_always_wait != 0) { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * e.g. for geom_part(4) to finish tasting. */ g_waitidle(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it is structured * this way to behave exactly as it did before. 
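* Poll every hz/10 ticks until the device shows up or root_mount_timeout seconds have elapsed.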
*/ vfs_mountroot_wait(); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; mtx_assert(&Giant, MA_NOTOWNED); TSENTER(); td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); TSEXIT(); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } - if( strcmp(name, "rw") == 0 || - strcmp(name, "noro") == 0) { + if (strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so we need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); } diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 48c7101aad97..9622503e1f96 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -1,370 +1,370 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2009 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS_BUF_RING_H_ #define _SYS_BUF_RING_H_ #include <machine/cpu.h> #ifdef DEBUG_BUFRING #include <sys/lock.h> #include <sys/mutex.h> #endif struct buf_ring { volatile uint32_t br_prod_head; volatile uint32_t br_prod_tail; int br_prod_size; int br_prod_mask; uint64_t br_drops; volatile uint32_t br_cons_head __aligned(CACHE_LINE_SIZE); volatile uint32_t br_cons_tail; int br_cons_size; int br_cons_mask; #ifdef DEBUG_BUFRING struct mtx *br_lock; #endif void *br_ring[0] __aligned(CACHE_LINE_SIZE); }; /* * multi-producer safe lock-free ring buffer enqueue * */ static __inline int buf_ring_enqueue(struct buf_ring *br, void *buf) { uint32_t prod_head, prod_next, cons_tail; #ifdef DEBUG_BUFRING int i; /* * Note: It is possible to encounter an mbuf that was removed * via drbr_peek(), and then re-added via drbr_putback(), * triggering a spurious panic. */ for (i = br->br_cons_head; i != br->br_prod_head; i = ((i + 1) & br->br_cons_mask)) - if(br->br_ring[i] == buf) + if (br->br_ring[i] == buf) panic("buf=%p already enqueued at %d prod=%d cons=%d", buf, i, br->br_prod_tail, br->br_cons_tail); #endif critical_enter(); do { prod_head = br->br_prod_head; prod_next = (prod_head + 1) & br->br_prod_mask; cons_tail = br->br_cons_tail; if (prod_next == cons_tail) { rmb(); if (prod_head == br->br_prod_head && cons_tail == br->br_cons_tail) { br->br_drops++; critical_exit(); return (ENOBUFS); } continue; } } while (!atomic_cmpset_acq_int(&br->br_prod_head, prod_head, prod_next)); #ifdef DEBUG_BUFRING if (br->br_ring[prod_head] != NULL) panic("dangling value in enqueue"); #endif br->br_ring[prod_head] = buf; /* * If there are other enqueues in progress * that preceded us, we need to wait for them * to complete. */ while (br->br_prod_tail != prod_head) cpu_spinwait(); atomic_store_rel_int(&br->br_prod_tail, prod_next); critical_exit(); return (0); } /* * multi-consumer safe dequeue * */ static __inline void * buf_ring_dequeue_mc(struct buf_ring *br) { uint32_t cons_head, cons_next; void *buf; critical_enter(); do { cons_head = br->br_cons_head; cons_next = (cons_head + 1) & br->br_cons_mask; if (cons_head == br->br_prod_tail) { critical_exit(); return (NULL); } } while (!atomic_cmpset_acq_int(&br->br_cons_head, cons_head, cons_next)); buf = br->br_ring[cons_head]; #ifdef DEBUG_BUFRING br->br_ring[cons_head] = NULL; #endif /* * If there are other dequeues in progress * that preceded us, we need to wait for them * to complete. */ while (br->br_cons_tail != cons_head) cpu_spinwait(); atomic_store_rel_int(&br->br_cons_tail, cons_next); critical_exit(); return (buf); } /* * single-consumer dequeue * use where dequeue is protected by a lock * e.g. a network driver's tx queue lock */ static __inline void * buf_ring_dequeue_sc(struct buf_ring *br) { uint32_t cons_head, cons_next; #ifdef PREFETCH_DEFINED uint32_t cons_next_next; #endif uint32_t prod_tail; void *buf; /* * This is a workaround to allow using buf_ring on ARM and ARM64. * ARM64TODO: Fix buf_ring in a generic way.
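 *
 * (Aside: a sketch of the single-consumer discipline this routine
 * assumes, with sc and hw_transmit() as hypothetical driver names.
 * Any number of threads may call buf_ring_enqueue() concurrently,
 * but all _sc routines must be serialized, e.g. by the driver's tx
 * queue lock:
 *
 *	mtx_lock(&sc->tx_lock);
 *	while ((m = buf_ring_dequeue_sc(sc->br)) != NULL)
 *		hw_transmit(sc, m);
 *	mtx_unlock(&sc->tx_lock);
 * )
 *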
* REMARKS: It is suspected that br_cons_head does not require a
 * load_acq operation, but this change was extensively tested and
 * confirmed to work. To be reviewed once again in FreeBSD-12.
 *
 * It prevents the following interleaving:
 *
 * Core(0) - buf_ring_enqueue()                  Core(1) - buf_ring_dequeue_sc()
 * ------------------------------                -------------------------------
 *
 *                                               cons_head = br->br_cons_head;
 * atomic_cmpset_acq_32(&br->br_prod_head, ...);
 *                                               buf = br->br_ring[cons_head];      <1>
 * br->br_ring[prod_head] = buf;
 * atomic_store_rel_32(&br->br_prod_tail, ...);
 *                                               prod_tail = br->br_prod_tail;
 *                                               if (cons_head == prod_tail)
 *                                                       return (NULL);
 *
 * <1> The load from br->br_ring[cons_head] on core 1 can be reordered
 *     (speculatively read) by the CPU.
 */ #if defined(__arm__) || defined(__aarch64__) cons_head = atomic_load_acq_32(&br->br_cons_head); #else cons_head = br->br_cons_head; #endif prod_tail = atomic_load_acq_32(&br->br_prod_tail); cons_next = (cons_head + 1) & br->br_cons_mask; #ifdef PREFETCH_DEFINED cons_next_next = (cons_head + 2) & br->br_cons_mask; #endif if (cons_head == prod_tail) return (NULL); #ifdef PREFETCH_DEFINED if (cons_next != prod_tail) { prefetch(br->br_ring[cons_next]); if (cons_next_next != prod_tail) prefetch(br->br_ring[cons_next_next]); } #endif br->br_cons_head = cons_next; buf = br->br_ring[cons_head]; #ifdef DEBUG_BUFRING br->br_ring[cons_head] = NULL; if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); if (br->br_cons_tail != cons_head) panic("inconsistent list cons_tail=%d cons_head=%d", br->br_cons_tail, cons_head); #endif br->br_cons_tail = cons_next; return (buf); } /* * single-consumer advance after a peek * use where it is protected by a lock * e.g. a network driver's tx queue lock */ static __inline void buf_ring_advance_sc(struct buf_ring *br) { uint32_t cons_head, cons_next; uint32_t prod_tail; cons_head = br->br_cons_head; prod_tail = br->br_prod_tail; cons_next = (cons_head + 1) & br->br_cons_mask; if (cons_head == prod_tail) return; br->br_cons_head = cons_next; #ifdef DEBUG_BUFRING br->br_ring[cons_head] = NULL; #endif br->br_cons_tail = cons_next; } /* * Used to return a buffer (most likely already there) * to the top of the ring. The caller should *not* * have used any dequeue to pull it out of the ring * but instead should have used the peek() function. * This is normally used where the transmit queue * of a driver is full, and an mbuf must be returned. * Most likely what's in the ring buffer is what * is being put back (since it was not removed), but * sometimes the lower transmit function may have * done a pullup or other function that will have * changed it. As an optimization we always put it * back (since jhb says the store is probably cheaper); * if we have to do a multi-queue version we will need * the compare and an atomic.
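 *
 * (A sketch of that driver pattern, with hw_tx_try() as a hypothetical
 * transmit attempt that fails when the hardware queue is full:
 *
 *	m = buf_ring_peek(br);
 *	if (m == NULL)
 *		return;
 *	if (hw_tx_try(sc, m) == 0)
 *		buf_ring_advance_sc(br);	-- consumed by hardware
 *	else
 *		buf_ring_putback_sc(br, m);	-- full; leave it queued
 * )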
*/ static __inline void buf_ring_putback_sc(struct buf_ring *br, void *new) { KASSERT(br->br_cons_head != br->br_prod_tail, ("Buf-Ring has none in putback")); br->br_ring[br->br_cons_head] = new; } /* * return a pointer to the first entry in the ring * without modifying it, or NULL if the ring is empty * race-prone if not protected by a lock */ static __inline void * buf_ring_peek(struct buf_ring *br) { #ifdef DEBUG_BUFRING if ((br->br_lock != NULL) && !mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif /* * I believe it is safe to not have a memory barrier here because we * control cons, and tail is at worst a lagging indicator, so in the * worst case we might return NULL immediately after a buffer has * been enqueued. */ if (br->br_cons_head == br->br_prod_tail) return (NULL); return (br->br_ring[br->br_cons_head]); } static __inline void * buf_ring_peek_clear_sc(struct buf_ring *br) { #ifdef DEBUG_BUFRING void *ret; if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif if (br->br_cons_head == br->br_prod_tail) return (NULL); #if defined(__arm__) || defined(__aarch64__) /* * The barrier is required on ARM and ARM64 to ensure that * br->br_ring[br->br_cons_head] will not be fetched before the above * condition is checked. * Without the barrier, it is possible that the buffer is fetched * before the enqueue puts the mbuf into br; the enqueue then updates * the array and br_prod_tail, the conditional check passes, and we * return the previously fetched (and now stale) buffer. */ atomic_thread_fence_acq(); #endif #ifdef DEBUG_BUFRING /* * Single consumer, i.e. cons_head will not move while we are * running, so atomic_swap_ptr() is not necessary here. */ ret = br->br_ring[br->br_cons_head]; br->br_ring[br->br_cons_head] = NULL; return (ret); #else return (br->br_ring[br->br_cons_head]); #endif } static __inline int buf_ring_full(struct buf_ring *br) { return (((br->br_prod_head + 1) & br->br_prod_mask) == br->br_cons_tail); } static __inline int buf_ring_empty(struct buf_ring *br) { return (br->br_cons_head == br->br_prod_tail); } static __inline int buf_ring_count(struct buf_ring *br) { return ((br->br_prod_size + br->br_prod_tail - br->br_cons_tail) & br->br_prod_mask); } struct buf_ring *buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *); void buf_ring_free(struct buf_ring *br, struct malloc_type *type); #endif diff --git a/sys/sys/efi.h b/sys/sys/efi.h index 5875e87b3595..0c0b52afc81d 100644 --- a/sys/sys/efi.h +++ b/sys/sys/efi.h @@ -1,282 +1,282 @@ /*- * Copyright (c) 2004 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EFI_H_ #define _SYS_EFI_H_ #include <sys/types.h> #include <sys/uuid.h> #define EFI_PAGE_SHIFT 12 #define EFI_PAGE_SIZE (1 << EFI_PAGE_SHIFT) #define EFI_PAGE_MASK (EFI_PAGE_SIZE - 1) #define EFI_TABLE_SMBIOS \ {0xeb9d2d31,0x2d88,0x11d3,0x9a,0x16,{0x00,0x90,0x27,0x3f,0xc1,0x4d}} #define EFI_TABLE_SMBIOS3 \ {0xf2fd1544,0x9794,0x4a2c,0x99,0x2e,{0xe5,0xbb,0xcf,0x20,0xe3,0x94}} enum efi_reset { EFI_RESET_COLD = 0, EFI_RESET_WARM = 1, EFI_RESET_SHUTDOWN = 2, }; typedef uint16_t efi_char; typedef unsigned long efi_status; struct efi_cfgtbl { struct uuid ct_uuid; void *ct_data; }; struct efi_md { uint32_t md_type; #define EFI_MD_TYPE_NULL 0 #define EFI_MD_TYPE_CODE 1 /* Loader text. */ #define EFI_MD_TYPE_DATA 2 /* Loader data. */ #define EFI_MD_TYPE_BS_CODE 3 /* Boot services text. */ #define EFI_MD_TYPE_BS_DATA 4 /* Boot services data. */ #define EFI_MD_TYPE_RT_CODE 5 /* Runtime services text. */ #define EFI_MD_TYPE_RT_DATA 6 /* Runtime services data. */ #define EFI_MD_TYPE_FREE 7 /* Unused/free memory. */ #define EFI_MD_TYPE_BAD 8 /* Bad memory. */ #define EFI_MD_TYPE_RECLAIM 9 /* ACPI reclaimable memory. */ #define EFI_MD_TYPE_FIRMWARE 10 /* ACPI NV memory. */ #define EFI_MD_TYPE_IOMEM 11 /* Memory-mapped I/O. */ #define EFI_MD_TYPE_IOPORT 12 /* I/O port space. */ #define EFI_MD_TYPE_PALCODE 13 /* PAL. */ #define EFI_MD_TYPE_PERSISTENT 14 /* Persistent memory.
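 *
 * (Illustration: consumers walk the firmware memory map with the
 * efi_next_descriptor() macro defined below, stepping by the
 * firmware-reported descriptor size rather than sizeof(struct efi_md).
 * Here map, dsz, ndesc and the nruntime tally are all hypothetical
 * values for the sketch:
 *
 *	struct efi_md *p;
 *	int i;
 *
 *	for (i = 0, p = map; i < ndesc;
 *	    i++, p = efi_next_descriptor(p, dsz))
 *		if ((p->md_attr & EFI_MD_ATTR_RT) != 0)
 *			nruntime++;	-- e.g. count runtime ranges
 * )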
*/ uint32_t __pad; uint64_t md_phys; void *md_virt; uint64_t md_pages; uint64_t md_attr; #define EFI_MD_ATTR_UC 0x0000000000000001UL #define EFI_MD_ATTR_WC 0x0000000000000002UL #define EFI_MD_ATTR_WT 0x0000000000000004UL #define EFI_MD_ATTR_WB 0x0000000000000008UL #define EFI_MD_ATTR_UCE 0x0000000000000010UL #define EFI_MD_ATTR_WP 0x0000000000001000UL #define EFI_MD_ATTR_RP 0x0000000000002000UL #define EFI_MD_ATTR_XP 0x0000000000004000UL #define EFI_MD_ATTR_NV 0x0000000000008000UL #define EFI_MD_ATTR_MORE_RELIABLE \ 0x0000000000010000UL #define EFI_MD_ATTR_RO 0x0000000000020000UL #define EFI_MD_ATTR_RT 0x8000000000000000UL }; #define efi_next_descriptor(ptr, size) \ ((struct efi_md *)(((uint8_t *)(ptr)) + (size))) struct efi_tm { uint16_t tm_year; /* 1998 - 20XX */ uint8_t tm_mon; /* 1 - 12 */ uint8_t tm_mday; /* 1 - 31 */ uint8_t tm_hour; /* 0 - 23 */ uint8_t tm_min; /* 0 - 59 */ uint8_t tm_sec; /* 0 - 59 */ uint8_t __pad1; uint32_t tm_nsec; /* 0 - 999,999,999 */ int16_t tm_tz; /* -1440 to 1440 or 2047 */ uint8_t tm_dst; uint8_t __pad2; }; struct efi_tmcap { uint32_t tc_res; /* 1e-6 parts per million */ uint32_t tc_prec; /* hertz */ uint8_t tc_stz; /* Set clears sub-second time */ }; struct efi_tblhdr { uint64_t th_sig; uint32_t th_rev; uint32_t th_hdrsz; uint32_t th_crc32; uint32_t __res; }; #ifdef _KERNEL #ifdef EFIABI_ATTR struct efi_rt { struct efi_tblhdr rt_hdr; efi_status (*rt_gettime)(struct efi_tm *, struct efi_tmcap *) EFIABI_ATTR; efi_status (*rt_settime)(struct efi_tm *) EFIABI_ATTR; efi_status (*rt_getwaketime)(uint8_t *, uint8_t *, struct efi_tm *) EFIABI_ATTR; efi_status (*rt_setwaketime)(uint8_t, struct efi_tm *) EFIABI_ATTR; efi_status (*rt_setvirtual)(u_long, u_long, uint32_t, struct efi_md *) EFIABI_ATTR; efi_status (*rt_cvtptr)(u_long, void **) EFIABI_ATTR; efi_status (*rt_getvar)(efi_char *, struct uuid *, uint32_t *, u_long *, void *) EFIABI_ATTR; efi_status (*rt_scanvar)(u_long *, efi_char *, struct uuid *) EFIABI_ATTR; efi_status (*rt_setvar)(efi_char *, struct uuid *, uint32_t, u_long, void *) EFIABI_ATTR; efi_status (*rt_gethicnt)(uint32_t *) EFIABI_ATTR; efi_status (*rt_reset)(enum efi_reset, efi_status, u_long, efi_char *) EFIABI_ATTR; }; #endif struct efi_systbl { struct efi_tblhdr st_hdr; #define EFI_SYSTBL_SIG 0x5453595320494249UL efi_char *st_fwvendor; uint32_t st_fwrev; uint32_t __pad; void *st_cin; void *st_cinif; void *st_cout; void *st_coutif; void *st_cerr; void *st_cerrif; uint64_t st_rt; void *st_bs; u_long st_entries; uint64_t st_cfgtbl; }; extern vm_paddr_t efi_systbl_phys; struct efirt_callinfo; /* Internal MD EFI functions */ int efi_arch_enter(void); void efi_arch_leave(void); vm_offset_t efi_phys_to_kva(vm_paddr_t); int efi_rt_arch_call(struct efirt_callinfo *); bool efi_create_1t1_map(struct efi_md *, int, int); void efi_destroy_1t1_map(void); struct efi_ops { /* * The EFI calls might be virtualized in some environments, requiring * FreeBSD to use a different interface (ie: hypercalls) in order to * access them. 
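 *
 * (Illustration: a hypothetical paravirtualized backend would supply
 * its own table and install it, leaving any unsupported slot NULL so
 * that the corresponding inline wrapper below returns ENXIO:
 *
 *	static const struct efi_ops hv_efi_ops = {
 *		.get_time = hv_get_time,
 *		.set_time = hv_set_time,
 *		.reset_system = hv_reset,
 *	};
 *	...
 *	active_efi_ops = &hv_efi_ops;
 * )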
*/ int (*rt_ok)(void); int (*get_table)(struct uuid *, void **); int (*get_time)(struct efi_tm *); int (*get_time_capabilities)(struct efi_tmcap *); int (*reset_system)(enum efi_reset); int (*set_time)(struct efi_tm *); int (*var_get)(uint16_t *, struct uuid *, uint32_t *, size_t *, void *); int (*var_nextname)(size_t *, uint16_t *, struct uuid *); int (*var_set)(uint16_t *, struct uuid *, uint32_t, size_t, void *); }; extern const struct efi_ops *active_efi_ops; /* Public MI EFI functions */ static inline int efi_rt_ok(void) { - if(active_efi_ops->rt_ok == NULL) + if (active_efi_ops->rt_ok == NULL) return (ENXIO); return (active_efi_ops->rt_ok()); } static inline int efi_get_table(struct uuid *uuid, void **ptr) { if (active_efi_ops->get_table == NULL) return (ENXIO); return (active_efi_ops->get_table(uuid, ptr)); } static inline int efi_get_time(struct efi_tm *tm) { if (active_efi_ops->get_time == NULL) return (ENXIO); return (active_efi_ops->get_time(tm)); } static inline int efi_get_time_capabilities(struct efi_tmcap *tmcap) { if (active_efi_ops->get_time_capabilities == NULL) return (ENXIO); return (active_efi_ops->get_time_capabilities(tmcap)); } static inline int efi_reset_system(enum efi_reset type) { if (active_efi_ops->reset_system == NULL) return (ENXIO); return (active_efi_ops->reset_system(type)); } static inline int efi_set_time(struct efi_tm *tm) { if (active_efi_ops->set_time == NULL) return (ENXIO); return (active_efi_ops->set_time(tm)); } static inline int efi_var_get(uint16_t *name, struct uuid *vendor, uint32_t *attrib, size_t *datasize, void *data) { if (active_efi_ops->var_get == NULL) return (ENXIO); return (active_efi_ops->var_get(name, vendor, attrib, datasize, data)); } static inline int efi_var_nextname(size_t *namesize, uint16_t *name, struct uuid *vendor) { if (active_efi_ops->var_nextname == NULL) return (ENXIO); return (active_efi_ops->var_nextname(namesize, name, vendor)); } static inline int efi_var_set(uint16_t *name, struct uuid *vendor, uint32_t attrib, size_t datasize, void *data) { if (active_efi_ops->var_set == NULL) return (ENXIO); return (active_efi_ops->var_set(name, vendor, attrib, datasize, data)); } int efi_status_to_errno(efi_status status); #endif /* _KERNEL */ #endif /* _SYS_EFI_H_ */ diff --git a/sys/sys/tree.h b/sys/sys/tree.h index eb5f244d8a12..bc01e4de910a 100644 --- a/sys/sys/tree.h +++ b/sys/sys/tree.h @@ -1,833 +1,833 @@ /* $NetBSD: tree.h,v 1.8 2004/03/28 19:38:30 provos Exp $ */ /* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2002 Niels Provos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_TREE_H_ #define _SYS_TREE_H_ #include <sys/cdefs.h> /* * This file defines data structures for different types of trees: * splay trees and rank-balanced trees. * * A splay tree is a self-organizing data structure. Every operation * on the tree causes a splay to happen. The splay moves the requested * node to the root of the tree and partly rebalances it. * * This has the benefit that request locality causes faster lookups as * the requested nodes move to the top of the tree. On the other hand, * every lookup causes memory writes. * * The Balance Theorem bounds the total access time for m operations * and n inserts on an initially empty tree as O((m + n)lg n). The * amortized cost for a sequence of m accesses to a splay tree is O(lg n). * * A rank-balanced tree is a binary search tree with an integer * rank-difference as an attribute of each pointer from parent to child. * The sum of the rank-differences on any path from a node down to null is * the same, and defines the rank of that node. The rank of the null node * is -1. * * Different additional conditions define different sorts of balanced * trees, including "red-black" and "AVL" trees. The conditions * applied here are the "weak-AVL" conditions of Haeupler, Sen and Tarjan: * - every rank-difference is 1 or 2. * - the rank of any leaf is 1. * * For historical reasons, rank differences that are even are associated * with the color red (Rank-Even-Difference), and the child that a red edge * points to is called a red child. * * Every operation on a rank-balanced tree is bounded as O(lg n). * The maximum height of a rank-balanced tree is 2lg (n+1).
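 *
 * (A minimal usage sketch of the rank-balanced ("RB") interface below,
 * with all names hypothetical:
 *
 *	struct node {
 *		RB_ENTRY(node) entry;
 *		int key;
 *	};
 *
 *	static int
 *	nodecmp(struct node *a, struct node *b)
 *	{
 *		return (a->key < b->key ? -1 : a->key > b->key);
 *	}
 *
 *	RB_HEAD(nodetree, node) head = RB_INITIALIZER(&head);
 *	RB_GENERATE_STATIC(nodetree, node, entry, nodecmp)
 *
 * after which RB_INSERT(nodetree, &head, n), RB_FIND(), RB_FOREACH()
 * and friends operate on the tree in O(lg n) per operation.)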
*/ #define SPLAY_HEAD(name, type) \ struct name { \ struct type *sph_root; /* root of the tree */ \ } #define SPLAY_INITIALIZER(root) \ { NULL } #define SPLAY_INIT(root) do { \ (root)->sph_root = NULL; \ } while (/*CONSTCOND*/ 0) #define SPLAY_ENTRY(type) \ struct { \ struct type *spe_left; /* left element */ \ struct type *spe_right; /* right element */ \ } #define SPLAY_LEFT(elm, field) (elm)->field.spe_left #define SPLAY_RIGHT(elm, field) (elm)->field.spe_right #define SPLAY_ROOT(head) (head)->sph_root #define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) /* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ #define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ (head)->sph_root = tmp; \ } while (/*CONSTCOND*/ 0) #define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ SPLAY_LEFT(tmp, field) = (head)->sph_root; \ (head)->sph_root = tmp; \ } while (/*CONSTCOND*/ 0) #define SPLAY_LINKLEFT(head, tmp, field) do { \ SPLAY_LEFT(tmp, field) = (head)->sph_root; \ tmp = (head)->sph_root; \ (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ } while (/*CONSTCOND*/ 0) #define SPLAY_LINKRIGHT(head, tmp, field) do { \ SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ tmp = (head)->sph_root; \ (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ } while (/*CONSTCOND*/ 0) #define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ } while (/*CONSTCOND*/ 0) /* Generates prototypes and inline functions */ #define SPLAY_PROTOTYPE(name, type, field, cmp) \ void name##_SPLAY(struct name *, struct type *); \ void name##_SPLAY_MINMAX(struct name *, int); \ struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ \ /* Finds the node with the same key as elm */ \ static __unused __inline struct type * \ name##_SPLAY_FIND(struct name *head, struct type *elm) \ { \ if (SPLAY_EMPTY(head)) \ return(NULL); \ name##_SPLAY(head, elm); \ if ((cmp)(elm, (head)->sph_root) == 0) \ return (head->sph_root); \ return (NULL); \ } \ \ static __unused __inline struct type * \ name##_SPLAY_NEXT(struct name *head, struct type *elm) \ { \ name##_SPLAY(head, elm); \ if (SPLAY_RIGHT(elm, field) != NULL) { \ elm = SPLAY_RIGHT(elm, field); \ while (SPLAY_LEFT(elm, field) != NULL) { \ elm = SPLAY_LEFT(elm, field); \ } \ } else \ elm = NULL; \ return (elm); \ } \ \ static __unused __inline struct type * \ name##_SPLAY_MIN_MAX(struct name *head, int val) \ { \ name##_SPLAY_MINMAX(head, val); \ return (SPLAY_ROOT(head)); \ } /* Main splay operation. 
* Moves node close to the key of elm to top */ #define SPLAY_GENERATE(name, type, field, cmp) \ struct type * \ name##_SPLAY_INSERT(struct name *head, struct type *elm) \ { \ if (SPLAY_EMPTY(head)) { \ SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ } else { \ int __comp; \ name##_SPLAY(head, elm); \ __comp = (cmp)(elm, (head)->sph_root); \ - if(__comp < 0) { \ + if (__comp < 0) { \ SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ SPLAY_RIGHT(elm, field) = (head)->sph_root; \ SPLAY_LEFT((head)->sph_root, field) = NULL; \ } else if (__comp > 0) { \ SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ SPLAY_LEFT(elm, field) = (head)->sph_root; \ SPLAY_RIGHT((head)->sph_root, field) = NULL; \ } else \ return ((head)->sph_root); \ } \ (head)->sph_root = (elm); \ return (NULL); \ } \ \ struct type * \ name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ { \ struct type *__tmp; \ if (SPLAY_EMPTY(head)) \ return (NULL); \ name##_SPLAY(head, elm); \ if ((cmp)(elm, (head)->sph_root) == 0) { \ if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ } else { \ __tmp = SPLAY_RIGHT((head)->sph_root, field); \ (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ name##_SPLAY(head, elm); \ SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ } \ return (elm); \ } \ return (NULL); \ } \ \ void \ name##_SPLAY(struct name *head, struct type *elm) \ { \ struct type __node, *__left, *__right, *__tmp; \ int __comp; \ \ SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ __left = __right = &__node; \ \ while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ if (__comp < 0) { \ __tmp = SPLAY_LEFT((head)->sph_root, field); \ if (__tmp == NULL) \ break; \ if ((cmp)(elm, __tmp) < 0){ \ SPLAY_ROTATE_RIGHT(head, __tmp, field); \ if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ break; \ } \ SPLAY_LINKLEFT(head, __right, field); \ } else if (__comp > 0) { \ __tmp = SPLAY_RIGHT((head)->sph_root, field); \ if (__tmp == NULL) \ break; \ if ((cmp)(elm, __tmp) > 0){ \ SPLAY_ROTATE_LEFT(head, __tmp, field); \ if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ break; \ } \ SPLAY_LINKRIGHT(head, __left, field); \ } \ } \ SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ } \ \ /* Splay with either the minimum or the maximum element \ * Used to find minimum or maximum element in tree. \ */ \ void name##_SPLAY_MINMAX(struct name *head, int __comp) \ { \ struct type __node, *__left, *__right, *__tmp; \ \ SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ __left = __right = &__node; \ \ while (1) { \ if (__comp < 0) { \ __tmp = SPLAY_LEFT((head)->sph_root, field); \ if (__tmp == NULL) \ break; \ if (__comp < 0){ \ SPLAY_ROTATE_RIGHT(head, __tmp, field); \ if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ break; \ } \ SPLAY_LINKLEFT(head, __right, field); \ } else if (__comp > 0) { \ __tmp = SPLAY_RIGHT((head)->sph_root, field); \ if (__tmp == NULL) \ break; \ if (__comp > 0) { \ SPLAY_ROTATE_LEFT(head, __tmp, field); \ if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ break; \ } \ SPLAY_LINKRIGHT(head, __left, field); \ } \ } \ SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ } #define SPLAY_NEGINF -1 #define SPLAY_INF 1 #define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) #define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) #define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) #define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) #define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? 
NULL \ : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) #define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) #define SPLAY_FOREACH(x, name, head) \ for ((x) = SPLAY_MIN(name, head); \ (x) != NULL; \ (x) = SPLAY_NEXT(name, head, x)) /* Macros that define a rank-balanced tree */ #define RB_HEAD(name, type) \ struct name { \ struct type *rbh_root; /* root of the tree */ \ } #define RB_INITIALIZER(root) \ { NULL } #define RB_INIT(root) do { \ (root)->rbh_root = NULL; \ } while (/*CONSTCOND*/ 0) #define RB_ENTRY(type) \ struct { \ struct type *rbe_left; /* left element */ \ struct type *rbe_right; /* right element */ \ struct type *rbe_parent; /* parent element */ \ } #define RB_LEFT(elm, field) (elm)->field.rbe_left #define RB_RIGHT(elm, field) (elm)->field.rbe_right /* * With the expectation that any object of struct type has an * address that is a multiple of 4, and that therefore the * 2 least significant bits of a pointer to struct type are * always zero, this implementation sets those bits to indicate * that the left or right child of the tree node is "red". */ #define RB_UP(elm, field) (elm)->field.rbe_parent #define RB_BITS(elm, field) (*(__uintptr_t *)&RB_UP(elm, field)) #define RB_RED_L ((__uintptr_t)1) #define RB_RED_R ((__uintptr_t)2) #define RB_RED_MASK ((__uintptr_t)3) #define RB_FLIP_LEFT(elm, field) (RB_BITS(elm, field) ^= RB_RED_L) #define RB_FLIP_RIGHT(elm, field) (RB_BITS(elm, field) ^= RB_RED_R) #define RB_RED_LEFT(elm, field) ((RB_BITS(elm, field) & RB_RED_L) != 0) #define RB_RED_RIGHT(elm, field) ((RB_BITS(elm, field) & RB_RED_R) != 0) #define RB_PARENT(elm, field) ((__typeof(RB_UP(elm, field))) \ (RB_BITS(elm, field) & ~RB_RED_MASK)) #define RB_ROOT(head) (head)->rbh_root #define RB_EMPTY(head) (RB_ROOT(head) == NULL) #define RB_SET_PARENT(dst, src, field) do { \ RB_BITS(dst, field) &= RB_RED_MASK; \ RB_BITS(dst, field) |= (__uintptr_t)src; \ } while (/*CONSTCOND*/ 0) #define RB_SET(elm, parent, field) do { \ RB_UP(elm, field) = parent; \ RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ } while (/*CONSTCOND*/ 0) #define RB_COLOR(elm, field) (RB_PARENT(elm, field) == NULL ? 0 : \ RB_LEFT(RB_PARENT(elm, field), field) == elm ? \ RB_RED_LEFT(RB_PARENT(elm, field), field) : \ RB_RED_RIGHT(RB_PARENT(elm, field), field)) /* * Something to be invoked in a loop at the root of every modified subtree, * from the bottom up to the root, to update augmented node data. 
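 *
 * (For illustration: a consumer keeping a count of nodes in each
 * subtree could, hypothetically, define before its RB_GENERATE() line:
 *
 *	#define RB_AUGMENT(x)						\
 *		(x)->size = 1 + NODE_SIZE(RB_LEFT(x, entry)) +		\
 *		    NODE_SIZE(RB_RIGHT(x, entry))
 *
 * where NODE_SIZE() is a hypothetical helper returning 0 for NULL.
 * The default below is "break", which ends each update loop at once.)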
*/ #ifndef RB_AUGMENT #define RB_AUGMENT(x) break #endif #define RB_SWAP_CHILD(head, out, in, field) do { \ if (RB_PARENT(out, field) == NULL) \ RB_ROOT(head) = (in); \ else if ((out) == RB_LEFT(RB_PARENT(out, field), field)) \ RB_LEFT(RB_PARENT(out, field), field) = (in); \ else \ RB_RIGHT(RB_PARENT(out, field), field) = (in); \ } while (/*CONSTCOND*/ 0) #define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ (tmp) = RB_RIGHT(elm, field); \ if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ RB_SET_PARENT(RB_RIGHT(elm, field), elm, field); \ } \ RB_SET_PARENT(tmp, RB_PARENT(elm, field), field); \ RB_SWAP_CHILD(head, elm, tmp, field); \ RB_LEFT(tmp, field) = (elm); \ RB_SET_PARENT(elm, tmp, field); \ RB_AUGMENT(elm); \ } while (/*CONSTCOND*/ 0) #define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ (tmp) = RB_LEFT(elm, field); \ if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ RB_SET_PARENT(RB_LEFT(elm, field), elm, field); \ } \ RB_SET_PARENT(tmp, RB_PARENT(elm, field), field); \ RB_SWAP_CHILD(head, elm, tmp, field); \ RB_RIGHT(tmp, field) = (elm); \ RB_SET_PARENT(elm, tmp, field); \ RB_AUGMENT(elm); \ } while (/*CONSTCOND*/ 0) /* Generates prototypes and inline functions */ #define RB_PROTOTYPE(name, type, field, cmp) \ RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) #define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __unused static) #define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ RB_PROTOTYPE_INSERT_COLOR(name, type, attr); \ RB_PROTOTYPE_REMOVE_COLOR(name, type, attr); \ RB_PROTOTYPE_INSERT(name, type, attr); \ RB_PROTOTYPE_REMOVE(name, type, attr); \ RB_PROTOTYPE_FIND(name, type, attr); \ RB_PROTOTYPE_NFIND(name, type, attr); \ RB_PROTOTYPE_NEXT(name, type, attr); \ RB_PROTOTYPE_PREV(name, type, attr); \ RB_PROTOTYPE_MINMAX(name, type, attr); \ RB_PROTOTYPE_REINSERT(name, type, attr); #define RB_PROTOTYPE_INSERT_COLOR(name, type, attr) \ attr void name##_RB_INSERT_COLOR(struct name *, struct type *) #define RB_PROTOTYPE_REMOVE_COLOR(name, type, attr) \ attr void name##_RB_REMOVE_COLOR(struct name *, \ struct type *, struct type *) #define RB_PROTOTYPE_REMOVE(name, type, attr) \ attr struct type *name##_RB_REMOVE(struct name *, struct type *) #define RB_PROTOTYPE_INSERT(name, type, attr) \ attr struct type *name##_RB_INSERT(struct name *, struct type *) #define RB_PROTOTYPE_FIND(name, type, attr) \ attr struct type *name##_RB_FIND(struct name *, struct type *) #define RB_PROTOTYPE_NFIND(name, type, attr) \ attr struct type *name##_RB_NFIND(struct name *, struct type *) #define RB_PROTOTYPE_NEXT(name, type, attr) \ attr struct type *name##_RB_NEXT(struct type *) #define RB_PROTOTYPE_PREV(name, type, attr) \ attr struct type *name##_RB_PREV(struct type *) #define RB_PROTOTYPE_MINMAX(name, type, attr) \ attr struct type *name##_RB_MINMAX(struct name *, int) #define RB_PROTOTYPE_REINSERT(name, type, attr) \ attr struct type *name##_RB_REINSERT(struct name *, struct type *) /* Main rb operation. 
* Moves node close to the key of elm to top */ #define RB_GENERATE(name, type, field, cmp) \ RB_GENERATE_INTERNAL(name, type, field, cmp,) #define RB_GENERATE_STATIC(name, type, field, cmp) \ RB_GENERATE_INTERNAL(name, type, field, cmp, __unused static) #define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ RB_GENERATE_INSERT_COLOR(name, type, field, attr) \ RB_GENERATE_REMOVE_COLOR(name, type, field, attr) \ RB_GENERATE_INSERT(name, type, field, cmp, attr) \ RB_GENERATE_REMOVE(name, type, field, attr) \ RB_GENERATE_FIND(name, type, field, cmp, attr) \ RB_GENERATE_NFIND(name, type, field, cmp, attr) \ RB_GENERATE_NEXT(name, type, field, attr) \ RB_GENERATE_PREV(name, type, field, attr) \ RB_GENERATE_MINMAX(name, type, field, attr) \ RB_GENERATE_REINSERT(name, type, field, cmp, attr) #define RB_GENERATE_INSERT_COLOR(name, type, field, attr) \ attr void \ name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ { \ struct type *child, *parent; \ while ((parent = RB_PARENT(elm, field)) != NULL) { \ if (RB_LEFT(parent, field) == elm) { \ if (RB_RED_LEFT(parent, field)) { \ RB_FLIP_LEFT(parent, field); \ return; \ } \ RB_FLIP_RIGHT(parent, field); \ if (RB_RED_RIGHT(parent, field)) { \ elm = parent; \ continue; \ } \ if (!RB_RED_RIGHT(elm, field)) { \ RB_FLIP_LEFT(elm, field); \ RB_ROTATE_LEFT(head, elm, child, field);\ if (RB_RED_LEFT(child, field)) \ RB_FLIP_RIGHT(elm, field); \ else if (RB_RED_RIGHT(child, field)) \ RB_FLIP_LEFT(parent, field); \ elm = child; \ } \ RB_ROTATE_RIGHT(head, parent, elm, field); \ } else { \ if (RB_RED_RIGHT(parent, field)) { \ RB_FLIP_RIGHT(parent, field); \ return; \ } \ RB_FLIP_LEFT(parent, field); \ if (RB_RED_LEFT(parent, field)) { \ elm = parent; \ continue; \ } \ if (!RB_RED_LEFT(elm, field)) { \ RB_FLIP_RIGHT(elm, field); \ RB_ROTATE_RIGHT(head, elm, child, field);\ if (RB_RED_RIGHT(child, field)) \ RB_FLIP_LEFT(elm, field); \ else if (RB_RED_LEFT(child, field)) \ RB_FLIP_RIGHT(parent, field); \ elm = child; \ } \ RB_ROTATE_LEFT(head, parent, elm, field); \ } \ RB_BITS(elm, field) &= ~RB_RED_MASK; \ break; \ } \ } #define RB_GENERATE_REMOVE_COLOR(name, type, field, attr) \ attr void \ name##_RB_REMOVE_COLOR(struct name *head, \ struct type *parent, struct type *elm) \ { \ struct type *sib; \ if (RB_LEFT(parent, field) == elm && \ RB_RIGHT(parent, field) == elm) { \ RB_BITS(parent, field) &= ~RB_RED_MASK; \ elm = parent; \ parent = RB_PARENT(elm, field); \ if (parent == NULL) \ return; \ } \ do { \ if (RB_LEFT(parent, field) == elm) { \ if (!RB_RED_LEFT(parent, field)) { \ RB_FLIP_LEFT(parent, field); \ return; \ } \ if (RB_RED_RIGHT(parent, field)) { \ RB_FLIP_RIGHT(parent, field); \ elm = parent; \ continue; \ } \ sib = RB_RIGHT(parent, field); \ if ((~RB_BITS(sib, field) & RB_RED_MASK) == 0) {\ RB_BITS(sib, field) &= ~RB_RED_MASK; \ elm = parent; \ continue; \ } \ RB_FLIP_RIGHT(sib, field); \ if (RB_RED_LEFT(sib, field)) \ RB_FLIP_LEFT(parent, field); \ else if (!RB_RED_RIGHT(sib, field)) { \ RB_FLIP_LEFT(parent, field); \ RB_ROTATE_RIGHT(head, sib, elm, field); \ if (RB_RED_RIGHT(elm, field)) \ RB_FLIP_LEFT(sib, field); \ if (RB_RED_LEFT(elm, field)) \ RB_FLIP_RIGHT(parent, field); \ RB_BITS(elm, field) |= RB_RED_MASK; \ sib = elm; \ } \ RB_ROTATE_LEFT(head, parent, sib, field); \ } else { \ if (!RB_RED_RIGHT(parent, field)) { \ RB_FLIP_RIGHT(parent, field); \ return; \ } \ if (RB_RED_LEFT(parent, field)) { \ RB_FLIP_LEFT(parent, field); \ elm = parent; \ continue; \ } \ sib = RB_LEFT(parent, field); \ if ((~RB_BITS(sib, field) & 
RB_RED_MASK) == 0) {\ RB_BITS(sib, field) &= ~RB_RED_MASK; \ elm = parent; \ continue; \ } \ RB_FLIP_LEFT(sib, field); \ if (RB_RED_RIGHT(sib, field)) \ RB_FLIP_RIGHT(parent, field); \ else if (!RB_RED_LEFT(sib, field)) { \ RB_FLIP_RIGHT(parent, field); \ RB_ROTATE_LEFT(head, sib, elm, field); \ if (RB_RED_LEFT(elm, field)) \ RB_FLIP_RIGHT(sib, field); \ if (RB_RED_RIGHT(elm, field)) \ RB_FLIP_LEFT(parent, field); \ RB_BITS(elm, field) |= RB_RED_MASK; \ sib = elm; \ } \ RB_ROTATE_RIGHT(head, parent, sib, field); \ } \ break; \ } while ((parent = RB_PARENT(elm, field)) != NULL); \ } #define RB_GENERATE_REMOVE(name, type, field, attr) \ attr struct type * \ name##_RB_REMOVE(struct name *head, struct type *elm) \ { \ struct type *child, *old, *parent, *right; \ \ old = elm; \ parent = RB_PARENT(elm, field); \ right = RB_RIGHT(elm, field); \ if (RB_LEFT(elm, field) == NULL) \ elm = child = right; \ else if (right == NULL) \ elm = child = RB_LEFT(elm, field); \ else { \ if ((child = RB_LEFT(right, field)) == NULL) { \ child = RB_RIGHT(right, field); \ RB_RIGHT(old, field) = child; \ parent = elm = right; \ } else { \ do \ elm = child; \ while ((child = RB_LEFT(elm, field)) != NULL); \ child = RB_RIGHT(elm, field); \ parent = RB_PARENT(elm, field); \ RB_LEFT(parent, field) = child; \ RB_SET_PARENT(RB_RIGHT(old, field), elm, field);\ } \ RB_SET_PARENT(RB_LEFT(old, field), elm, field); \ elm->field = old->field; \ } \ RB_SWAP_CHILD(head, old, elm, field); \ if (child != NULL) \ RB_SET_PARENT(child, parent, field); \ if (parent != NULL) \ name##_RB_REMOVE_COLOR(head, parent, child); \ while (parent != NULL) { \ RB_AUGMENT(parent); \ parent = RB_PARENT(parent, field); \ } \ return (old); \ } #define RB_GENERATE_INSERT(name, type, field, cmp, attr) \ /* Inserts a node into the RB tree */ \ attr struct type * \ name##_RB_INSERT(struct name *head, struct type *elm) \ { \ struct type *tmp; \ struct type *parent = NULL; \ int comp = 0; \ tmp = RB_ROOT(head); \ while (tmp) { \ parent = tmp; \ comp = (cmp)(elm, parent); \ if (comp < 0) \ tmp = RB_LEFT(tmp, field); \ else if (comp > 0) \ tmp = RB_RIGHT(tmp, field); \ else \ return (tmp); \ } \ RB_SET(elm, parent, field); \ if (parent == NULL) \ RB_ROOT(head) = elm; \ else if (comp < 0) \ RB_LEFT(parent, field) = elm; \ else \ RB_RIGHT(parent, field) = elm; \ name##_RB_INSERT_COLOR(head, elm); \ while (elm != NULL) { \ RB_AUGMENT(elm); \ elm = RB_PARENT(elm, field); \ } \ return (NULL); \ } #define RB_GENERATE_FIND(name, type, field, cmp, attr) \ /* Finds the node with the same key as elm */ \ attr struct type * \ name##_RB_FIND(struct name *head, struct type *elm) \ { \ struct type *tmp = RB_ROOT(head); \ int comp; \ while (tmp) { \ comp = cmp(elm, tmp); \ if (comp < 0) \ tmp = RB_LEFT(tmp, field); \ else if (comp > 0) \ tmp = RB_RIGHT(tmp, field); \ else \ return (tmp); \ } \ return (NULL); \ } #define RB_GENERATE_NFIND(name, type, field, cmp, attr) \ /* Finds the first node greater than or equal to the search key */ \ attr struct type * \ name##_RB_NFIND(struct name *head, struct type *elm) \ { \ struct type *tmp = RB_ROOT(head); \ struct type *res = NULL; \ int comp; \ while (tmp) { \ comp = cmp(elm, tmp); \ if (comp < 0) { \ res = tmp; \ tmp = RB_LEFT(tmp, field); \ } \ else if (comp > 0) \ tmp = RB_RIGHT(tmp, field); \ else \ return (tmp); \ } \ return (res); \ } #define RB_GENERATE_NEXT(name, type, field, attr) \ /* ARGSUSED */ \ attr struct type * \ name##_RB_NEXT(struct type *elm) \ { \ if (RB_RIGHT(elm, field)) { \ elm = RB_RIGHT(elm, field); \ 
while (RB_LEFT(elm, field)) \ elm = RB_LEFT(elm, field); \ } else { \ if (RB_PARENT(elm, field) && \ (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ elm = RB_PARENT(elm, field); \ else { \ while (RB_PARENT(elm, field) && \ (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ elm = RB_PARENT(elm, field); \ elm = RB_PARENT(elm, field); \ } \ } \ return (elm); \ } #define RB_GENERATE_PREV(name, type, field, attr) \ /* ARGSUSED */ \ attr struct type * \ name##_RB_PREV(struct type *elm) \ { \ if (RB_LEFT(elm, field)) { \ elm = RB_LEFT(elm, field); \ while (RB_RIGHT(elm, field)) \ elm = RB_RIGHT(elm, field); \ } else { \ if (RB_PARENT(elm, field) && \ (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ elm = RB_PARENT(elm, field); \ else { \ while (RB_PARENT(elm, field) && \ (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ elm = RB_PARENT(elm, field); \ elm = RB_PARENT(elm, field); \ } \ } \ return (elm); \ } #define RB_GENERATE_MINMAX(name, type, field, attr) \ attr struct type * \ name##_RB_MINMAX(struct name *head, int val) \ { \ struct type *tmp = RB_ROOT(head); \ struct type *parent = NULL; \ while (tmp) { \ parent = tmp; \ if (val < 0) \ tmp = RB_LEFT(tmp, field); \ else \ tmp = RB_RIGHT(tmp, field); \ } \ return (parent); \ } #define RB_GENERATE_REINSERT(name, type, field, cmp, attr) \ attr struct type * \ name##_RB_REINSERT(struct name *head, struct type *elm) \ { \ struct type *cmpelm; \ if (((cmpelm = RB_PREV(name, head, elm)) != NULL && \ cmp(cmpelm, elm) >= 0) || \ ((cmpelm = RB_NEXT(name, head, elm)) != NULL && \ cmp(elm, cmpelm) >= 0)) { \ /* XXXLAS: Remove/insert is heavy handed. */ \ RB_REMOVE(name, head, elm); \ return (RB_INSERT(name, head, elm)); \ } \ return (NULL); \ } \ #define RB_NEGINF -1 #define RB_INF 1 #define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) #define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) #define RB_FIND(name, x, y) name##_RB_FIND(x, y) #define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) #define RB_NEXT(name, x, y) name##_RB_NEXT(y) #define RB_PREV(name, x, y) name##_RB_PREV(y) #define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) #define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) #define RB_REINSERT(name, x, y) name##_RB_REINSERT(x, y) #define RB_FOREACH(x, name, head) \ for ((x) = RB_MIN(name, head); \ (x) != NULL; \ (x) = name##_RB_NEXT(x)) #define RB_FOREACH_FROM(x, name, y) \ for ((x) = (y); \ ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ (x) = (y)) #define RB_FOREACH_SAFE(x, name, head, y) \ for ((x) = RB_MIN(name, head); \ ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ (x) = (y)) #define RB_FOREACH_REVERSE(x, name, head) \ for ((x) = RB_MAX(name, head); \ (x) != NULL; \ (x) = name##_RB_PREV(x)) #define RB_FOREACH_REVERSE_FROM(x, name, y) \ for ((x) = (y); \ ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ (x) = (y)) #define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ for ((x) = RB_MAX(name, head); \ ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ (x) = (y)) #endif /* _SYS_TREE_H_ */