Index: head/sys/kern/kern_rmlock.c =================================================================== --- head/sys/kern/kern_rmlock.c (revision 367383) +++ head/sys/kern/kern_rmlock.c (revision 367384) @@ -1,1123 +1,1123 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007 Stephan Uphoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * A cookie to mark destroyed rmlocks. This is stored in the head of * rm_activeReaders. */ #define RM_DESTROYED ((void *)0xdead) #define rm_destroyed(rm) \ (LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED) #define RMPF_ONQUEUE 1 #define RMPF_SIGNAL 2 #ifndef INVARIANTS #define _rm_assert(c, what, file, line) #endif static void assert_rm(const struct lock_object *lock, int what); #ifdef DDB static void db_show_rm(const struct lock_object *lock); #endif static void lock_rm(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rm(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rm(struct lock_object *lock); struct lock_class lock_class_rm = { .lc_name = "rm", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_rm, #ifdef DDB .lc_ddb_show = db_show_rm, #endif .lc_lock = lock_rm, .lc_unlock = unlock_rm, #ifdef KDTRACE_HOOKS .lc_owner = owner_rm, #endif }; struct lock_class lock_class_rm_sleepable = { .lc_name = "sleepable rm", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE, .lc_assert = assert_rm, #ifdef DDB .lc_ddb_show = db_show_rm, #endif .lc_lock = lock_rm, .lc_unlock = unlock_rm, #ifdef KDTRACE_HOOKS .lc_owner = owner_rm, #endif }; static void assert_rm(const struct lock_object *lock, int what) { rm_assert((const struct rmlock *)lock, what); } static void lock_rm(struct lock_object *lock, uintptr_t how) { struct rmlock *rm; struct rm_priotracker *tracker; rm = (struct rmlock *)lock; if (how == 0) rm_wlock(rm); else { tracker = (struct rm_priotracker *)how; rm_rlock(rm, tracker); } } static uintptr_t unlock_rm(struct lock_object *lock) { struct thread *td; struct pcpu *pc; struct rmlock *rm; struct rm_queue *queue; struct rm_priotracker *tracker; uintptr_t how; rm = (struct rmlock *)lock; tracker = NULL; how = 0; rm_assert(rm, RA_LOCKED | RA_NOTRECURSED); if (rm_wowned(rm)) rm_wunlock(rm); else { /* * Find the right rm_priotracker structure for curthread. * The guarantee about its uniqueness is given by the fact * we already asserted the lock wasn't recursively acquired. */ critical_enter(); td = curthread; pc = get_pcpu(); for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td)) { how = (uintptr_t)tracker; break; } } KASSERT(tracker != NULL, ("rm_priotracker is non-NULL when lock held in read mode")); critical_exit(); rm_runlock(rm, tracker); } return (how); } #ifdef KDTRACE_HOOKS static int owner_rm(const struct lock_object *lock, struct thread **owner) { const struct rmlock *rm; struct lock_class *lc; rm = (const struct rmlock *)lock; lc = LOCK_CLASS(&rm->rm_wlock_object); return (lc->lc_owner(&rm->rm_wlock_object, owner)); } #endif static struct mtx rm_spinlock; MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN); /* * Add or remove tracker from per-cpu list. * * The per-cpu list can be traversed at any time in forward direction from an * interrupt on the *local* cpu. */ static void inline rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker) { struct rm_queue *next; /* Initialize all tracker pointers */ tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue; next = pc->pc_rm_queue.rmq_next; tracker->rmp_cpuQueue.rmq_next = next; /* rmq_prev is not used during froward traversal. */ next->rmq_prev = &tracker->rmp_cpuQueue; /* Update pointer to first element. */ pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue; } /* * Return a count of the number of trackers the thread 'td' already * has on this CPU for the lock 'rm'. */ static int rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm, const struct thread *td) { struct rm_queue *queue; struct rm_priotracker *tracker; int count; count = 0; for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td)) count++; } return (count); } static void inline rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker) { struct rm_queue *next, *prev; next = tracker->rmp_cpuQueue.rmq_next; prev = tracker->rmp_cpuQueue.rmq_prev; /* Not used during forward traversal. */ next->rmq_prev = prev; /* Remove from list. */ prev->rmq_next = next; } static void rm_cleanIPI(void *arg) { struct pcpu *pc; struct rmlock *rm = arg; struct rm_priotracker *tracker; struct rm_queue *queue; pc = get_pcpu(); for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) { tracker->rmp_flags = RMPF_ONQUEUE; mtx_lock_spin(&rm_spinlock); LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker, rmp_qentry); mtx_unlock_spin(&rm_spinlock); } } } void rm_init_flags(struct rmlock *rm, const char *name, int opts) { struct lock_class *lc; int liflags, xflags; liflags = 0; if (!(opts & RM_NOWITNESS)) liflags |= LO_WITNESS; if (opts & RM_RECURSE) liflags |= LO_RECURSABLE; if (opts & RM_NEW) liflags |= LO_NEW; rm->rm_writecpus = all_cpus; LIST_INIT(&rm->rm_activeReaders); if (opts & RM_SLEEPABLE) { liflags |= LO_SLEEPABLE; lc = &lock_class_rm_sleepable; xflags = (opts & RM_NEW ? SX_NEW : 0); sx_init_flags(&rm->rm_lock_sx, "rmlock_sx", xflags | SX_NOWITNESS); } else { lc = &lock_class_rm; xflags = (opts & RM_NEW ? MTX_NEW : 0); mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx", xflags | MTX_NOWITNESS); } lock_init(&rm->lock_object, lc, name, NULL, liflags); } void rm_init(struct rmlock *rm, const char *name) { rm_init_flags(rm, name, 0); } void rm_destroy(struct rmlock *rm) { rm_assert(rm, RA_UNLOCKED); LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED; if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_destroy(&rm->rm_lock_sx); else mtx_destroy(&rm->rm_lock_mtx); lock_destroy(&rm->lock_object); } int rm_wowned(const struct rmlock *rm) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) return (sx_xlocked(&rm->rm_lock_sx)); else return (mtx_owned(&rm->rm_lock_mtx)); } void rm_sysinit(void *arg) { struct rm_args *args; args = arg; rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags); } static __noinline int _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) { struct pcpu *pc; critical_enter(); pc = get_pcpu(); /* Check if we just need to do a proper critical_exit. */ if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) { critical_exit(); return (1); } /* Remove our tracker from the per-cpu list. */ rm_tracker_remove(pc, tracker); /* Check to see if the IPI granted us the lock after all. */ if (tracker->rmp_flags) { /* Just add back tracker - we hold the lock. */ rm_tracker_add(pc, tracker); critical_exit(); return (1); } /* * We allow readers to acquire a lock even if a writer is blocked if * the lock is recursive and the reader already holds the lock. */ if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) { /* * Just grant the lock if this thread already has a tracker * for this lock on the per-cpu queue. */ if (rm_trackers_present(pc, rm, curthread) != 0) { mtx_lock_spin(&rm_spinlock); LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker, rmp_qentry); tracker->rmp_flags = RMPF_ONQUEUE; mtx_unlock_spin(&rm_spinlock); rm_tracker_add(pc, tracker); critical_exit(); return (1); } } sched_unpin(); critical_exit(); if (trylock) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) { if (!sx_try_xlock(&rm->rm_lock_sx)) return (0); } else { if (!mtx_trylock(&rm->rm_lock_mtx)) return (0); } } else { if (rm->lock_object.lo_flags & LO_SLEEPABLE) { THREAD_SLEEPING_OK(); sx_xlock(&rm->rm_lock_sx); THREAD_NO_SLEEPING(); } else mtx_lock(&rm->rm_lock_mtx); } critical_enter(); pc = get_pcpu(); CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus); rm_tracker_add(pc, tracker); sched_pin(); critical_exit(); if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xunlock(&rm->rm_lock_sx); else mtx_unlock(&rm->rm_lock_mtx); return (1); } int _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) { struct thread *td = curthread; struct pcpu *pc; if (SCHEDULER_STOPPED()) return (1); tracker->rmp_flags = 0; tracker->rmp_thread = td; tracker->rmp_rmlock = rm; if (rm->lock_object.lo_flags & LO_SLEEPABLE) THREAD_NO_SLEEPING(); td->td_critnest++; /* critical_enter(); */ __compiler_membar(); pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */ rm_tracker_add(pc, tracker); sched_pin(); __compiler_membar(); td->td_critnest--; /* * Fast path to combine two common conditions into a single * conditional jump. */ if (__predict_true(0 == (td->td_owepreempt | CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)))) return (1); /* We do not have a read token and need to acquire one. */ return _rm_rlock_hard(rm, tracker, trylock); } static __noinline void _rm_unlock_hard(struct thread *td,struct rm_priotracker *tracker) { if (td->td_owepreempt) { td->td_critnest++; critical_exit(); } if (!tracker->rmp_flags) return; mtx_lock_spin(&rm_spinlock); LIST_REMOVE(tracker, rmp_qentry); if (tracker->rmp_flags & RMPF_SIGNAL) { struct rmlock *rm; struct turnstile *ts; rm = tracker->rmp_rmlock; turnstile_chain_lock(&rm->lock_object); mtx_unlock_spin(&rm_spinlock); ts = turnstile_lookup(&rm->lock_object); turnstile_signal(ts, TS_EXCLUSIVE_QUEUE); turnstile_unpend(ts); turnstile_chain_unlock(&rm->lock_object); } else mtx_unlock_spin(&rm_spinlock); } void _rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker) { struct pcpu *pc; struct thread *td = tracker->rmp_thread; if (SCHEDULER_STOPPED()) return; td->td_critnest++; /* critical_enter(); */ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */ rm_tracker_remove(pc, tracker); td->td_critnest--; sched_unpin(); if (rm->lock_object.lo_flags & LO_SLEEPABLE) THREAD_SLEEPING_OK(); if (__predict_true(0 == (td->td_owepreempt | tracker->rmp_flags))) return; _rm_unlock_hard(td, tracker); } void _rm_wlock(struct rmlock *rm) { struct rm_priotracker *prio; struct turnstile *ts; cpuset_t readcpus; if (SCHEDULER_STOPPED()) return; if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xlock(&rm->rm_lock_sx); else mtx_lock(&rm->rm_lock_mtx); if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) { /* Get all read tokens back */ readcpus = all_cpus; CPU_ANDNOT(&readcpus, &rm->rm_writecpus); rm->rm_writecpus = all_cpus; /* * Assumes rm->rm_writecpus update is visible on other CPUs * before rm_cleanIPI is called. */ #ifdef SMP smp_rendezvous_cpus(readcpus, smp_no_rendezvous_barrier, rm_cleanIPI, smp_no_rendezvous_barrier, rm); #else rm_cleanIPI(rm); #endif mtx_lock_spin(&rm_spinlock); while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) { ts = turnstile_trywait(&rm->lock_object); prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL; mtx_unlock_spin(&rm_spinlock); turnstile_wait(ts, prio->rmp_thread, TS_EXCLUSIVE_QUEUE); mtx_lock_spin(&rm_spinlock); } mtx_unlock_spin(&rm_spinlock); } } void _rm_wunlock(struct rmlock *rm) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xunlock(&rm->rm_lock_sx); else mtx_unlock(&rm->rm_lock_mtx); } #if LOCK_DEBUG > 0 void _rm_wlock_debug(struct rmlock *rm, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); KASSERT(!rm_destroyed(rm), ("rm_wlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_UNLOCKED, file, line); WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); _rm_wlock(rm); LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line); WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(!rm_destroyed(rm), ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line); _rm_wunlock(rm); TD_LOCKS_DEC(curthread); } int _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, int trylock, const char *file, int line) { if (SCHEDULER_STOPPED()) return (1); #ifdef INVARIANTS if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) { critical_enter(); KASSERT(rm_trackers_present(get_pcpu(), rm, curthread) == 0, ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n", rm->lock_object.lo_name, file, line)); critical_exit(); } #endif KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); KASSERT(!rm_destroyed(rm), ("rm_rlock() of destroyed rmlock @ %s:%d", file, line)); if (!trylock) { KASSERT(!rm_wowned(rm), ("rm_rlock: wlock already held for %s @ %s:%d", rm->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_NOSLEEP, file, line, NULL); } if (_rm_rlock(rm, tracker, trylock)) { if (trylock) LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file, line); else LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file, line); WITNESS_LOCK(&rm->lock_object, LOP_NOSLEEP, file, line); TD_LOCKS_INC(curthread); return (1); } else if (trylock) LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line); return (0); } void _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(!rm_destroyed(rm), ("rm_runlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rm->lock_object, 0, file, line); LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line); _rm_runlock(rm, tracker); TD_LOCKS_DEC(curthread); } #else /* * Just strip out file and line arguments if no lock debugging is enabled in * the kernel - we are called from a kernel module. */ void _rm_wlock_debug(struct rmlock *rm, const char *file, int line) { _rm_wlock(rm); } void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line) { _rm_wunlock(rm); } int _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, int trylock, const char *file, int line) { return _rm_rlock(rm, tracker, trylock); } void _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, const char *file, int line) { _rm_runlock(rm, tracker); } #endif #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _rm_assert #endif /* * Note that this does not need to use witness_assert() for read lock * assertions since an exact count of read locks held by this thread * is computable. */ void _rm_assert(const struct rmlock *rm, int what, const char *file, int line) { int count; if (SCHEDULER_STOPPED()) return; switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: /* * Handle the write-locked case. Unlike other * primitives, writers can never recurse. */ if (rm_wowned(rm)) { if (what & RA_RLOCKED) panic("Lock %s exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); break; } critical_enter(); count = rm_trackers_present(get_pcpu(), rm, curthread); critical_exit(); if (count == 0) panic("Lock %s not %slocked @ %s:%d\n", rm->lock_object.lo_name, (what & RA_RLOCKED) ? "read " : "", file, line); if (count > 1) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); break; case RA_WLOCKED: if (!rm_wowned(rm)) panic("Lock %s not exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); break; case RA_UNLOCKED: if (rm_wowned(rm)) panic("Lock %s exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); critical_enter(); count = rm_trackers_present(get_pcpu(), rm, curthread); critical_exit(); if (count != 0) panic("Lock %s read locked @ %s:%d\n", rm->lock_object.lo_name, file, line); break; default: panic("Unknown rm lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void print_tracker(struct rm_priotracker *tr) { struct thread *td; td = tr->rmp_thread; db_printf(" thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (tr->rmp_flags & RMPF_ONQUEUE) { db_printf("ONQUEUE"); if (tr->rmp_flags & RMPF_SIGNAL) db_printf(",SIGNAL"); } else db_printf("0"); db_printf("}\n"); } static void db_show_rm(const struct lock_object *lock) { struct rm_priotracker *tr; struct rm_queue *queue; const struct rmlock *rm; struct lock_class *lc; struct pcpu *pc; rm = (const struct rmlock *)lock; db_printf(" writecpus: "); ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus)); db_printf("\n"); db_printf(" per-CPU readers:\n"); STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tr = (struct rm_priotracker *)queue; if (tr->rmp_rmlock == rm) print_tracker(tr); } db_printf(" active readers:\n"); LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry) print_tracker(tr); lc = LOCK_CLASS(&rm->rm_wlock_object); db_printf("Backing write-lock (%s):\n", lc->lc_name); lc->lc_ddb_show(&rm->rm_wlock_object); } #endif /* * Read-mostly sleepable locks. * * These primitives allow both readers and writers to sleep. However, neither * readers nor writers are tracked and subsequently there is no priority * propagation. * * They are intended to be only used when write-locking is almost never needed * (e.g., they can guard against unloading a kernel module) while read-locking * happens all the time. * * Concurrent writers take turns taking the lock while going off cpu. If this is * of concern for your usecase, this is not the right primitive. * * Neither rms_rlock nor rms_runlock use fences. Instead compiler barriers are * inserted to prevert reordering of generated code. Execution ordering is * provided with the use of an IPI handler. * * No attempt is made to track which CPUs read locked at least once, * consequently write locking sends IPIs to all of them. This will become a * problem at some point. The easiest way to lessen it is to provide a bitmap. */ #define RMS_NOOWNER ((void *)0x1) #define RMS_TRANSIENT ((void *)0x2) #define RMS_FLAGMASK 0xf void rms_init(struct rmslock *rms, const char *name) { rms->owner = RMS_NOOWNER; rms->writers = 0; rms->readers = 0; mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW); - rms->readers_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); - rms->readers_influx = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); + rms->readers_pcpu = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); + rms->readers_influx = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); } void rms_destroy(struct rmslock *rms) { MPASS(rms->writers == 0); MPASS(rms->readers == 0); mtx_destroy(&rms->mtx); - uma_zfree_pcpu(pcpu_zone_int, rms->readers_pcpu); - uma_zfree_pcpu(pcpu_zone_int, rms->readers_influx); + uma_zfree_pcpu(pcpu_zone_4, rms->readers_pcpu); + uma_zfree_pcpu(pcpu_zone_4, rms->readers_influx); } static void __noinline rms_rlock_fallback(struct rmslock *rms) { zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); mtx_lock(&rms->mtx); MPASS(*zpcpu_get(rms->readers_pcpu) == 0); while (rms->writers > 0) msleep(&rms->readers, &rms->mtx, PUSER - 1, mtx_name(&rms->mtx), 0); critical_enter(); zpcpu_add_protected(rms->readers_pcpu, 1); mtx_unlock(&rms->mtx); critical_exit(); } void rms_rlock(struct rmslock *rms) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); MPASS(atomic_load_ptr(&rms->owner) != curthread); critical_enter(); zpcpu_set_protected(rms->readers_influx, 1); __compiler_membar(); if (__predict_false(rms->writers > 0)) { rms_rlock_fallback(rms); return; } __compiler_membar(); zpcpu_add_protected(rms->readers_pcpu, 1); __compiler_membar(); zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); } int rms_try_rlock(struct rmslock *rms) { MPASS(atomic_load_ptr(&rms->owner) != curthread); critical_enter(); zpcpu_set_protected(rms->readers_influx, 1); __compiler_membar(); if (__predict_false(rms->writers > 0)) { __compiler_membar(); zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); return (0); } __compiler_membar(); zpcpu_add_protected(rms->readers_pcpu, 1); __compiler_membar(); zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); return (1); } static void __noinline rms_runlock_fallback(struct rmslock *rms) { zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); mtx_lock(&rms->mtx); MPASS(*zpcpu_get(rms->readers_pcpu) == 0); MPASS(rms->writers > 0); MPASS(rms->readers > 0); rms->readers--; if (rms->readers == 0) wakeup_one(&rms->writers); mtx_unlock(&rms->mtx); } void rms_runlock(struct rmslock *rms) { critical_enter(); zpcpu_set_protected(rms->readers_influx, 1); __compiler_membar(); if (__predict_false(rms->writers > 0)) { rms_runlock_fallback(rms); return; } __compiler_membar(); zpcpu_sub_protected(rms->readers_pcpu, 1); __compiler_membar(); zpcpu_set_protected(rms->readers_influx, 0); critical_exit(); } struct rmslock_ipi { struct rmslock *rms; struct smp_rendezvous_cpus_retry_arg srcra; }; static void rms_action_func(void *arg) { struct rmslock_ipi *rmsipi; struct rmslock *rms; int readers; rmsipi = __containerof(arg, struct rmslock_ipi, srcra); rms = rmsipi->rms; if (*zpcpu_get(rms->readers_influx)) return; readers = zpcpu_replace(rms->readers_pcpu, 0); if (readers != 0) atomic_add_int(&rms->readers, readers); smp_rendezvous_cpus_done(arg); } static void rms_wait_func(void *arg, int cpu) { struct rmslock_ipi *rmsipi; struct rmslock *rms; int *in_op; rmsipi = __containerof(arg, struct rmslock_ipi, srcra); rms = rmsipi->rms; in_op = zpcpu_get_cpu(rms->readers_influx, cpu); while (atomic_load_int(in_op)) cpu_spinwait(); } static void rms_wlock_switch(struct rmslock *rms) { struct rmslock_ipi rmsipi; MPASS(rms->readers == 0); MPASS(rms->writers == 1); rmsipi.rms = rms; smp_rendezvous_cpus_retry(all_cpus, smp_no_rendezvous_barrier, rms_action_func, smp_no_rendezvous_barrier, rms_wait_func, &rmsipi.srcra); } void rms_wlock(struct rmslock *rms) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); MPASS(atomic_load_ptr(&rms->owner) != curthread); mtx_lock(&rms->mtx); rms->writers++; if (rms->writers > 1) { msleep(&rms->owner, &rms->mtx, (PUSER - 1), mtx_name(&rms->mtx), 0); MPASS(rms->readers == 0); KASSERT(rms->owner == RMS_TRANSIENT, ("%s: unexpected owner value %p\n", __func__, rms->owner)); goto out_grab; } KASSERT(rms->owner == RMS_NOOWNER, ("%s: unexpected owner value %p\n", __func__, rms->owner)); rms_wlock_switch(rms); if (rms->readers > 0) { msleep(&rms->writers, &rms->mtx, (PUSER - 1), mtx_name(&rms->mtx), 0); } out_grab: rms->owner = curthread; mtx_unlock(&rms->mtx); MPASS(rms->readers == 0); } void rms_wunlock(struct rmslock *rms) { mtx_lock(&rms->mtx); KASSERT(rms->owner == curthread, ("%s: unexpected owner value %p\n", __func__, rms->owner)); MPASS(rms->writers >= 1); MPASS(rms->readers == 0); rms->writers--; if (rms->writers > 0) { wakeup_one(&rms->owner); rms->owner = RMS_TRANSIENT; } else { wakeup(&rms->readers); rms->owner = RMS_NOOWNER; } mtx_unlock(&rms->mtx); } void rms_unlock(struct rmslock *rms) { if (rms_wowned(rms)) rms_wunlock(rms); else rms_runlock(rms); } Index: head/sys/kern/subr_counter.c =================================================================== --- head/sys/kern/subr_counter.c (revision 367383) +++ head/sys/kern/subr_counter.c (revision 367384) @@ -1,192 +1,192 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #define IN_SUBR_COUNTER_C #include void counter_u64_zero(counter_u64_t c) { counter_u64_zero_inline(c); } uint64_t counter_u64_fetch(counter_u64_t c) { return (counter_u64_fetch_inline(c)); } counter_u64_t counter_u64_alloc(int flags) { - return (uma_zalloc_pcpu(pcpu_zone_64, flags | M_ZERO)); + return (uma_zalloc_pcpu(pcpu_zone_8, flags | M_ZERO)); } void counter_u64_free(counter_u64_t c) { - uma_zfree_pcpu(pcpu_zone_64, c); + uma_zfree_pcpu(pcpu_zone_8, c); } int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS) { uint64_t out; int error; out = counter_u64_fetch(*(counter_u64_t *)arg1); error = SYSCTL_OUT(req, &out, sizeof(uint64_t)); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ counter_u64_zero(*(counter_u64_t *)arg1); return (0); } int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS) { uint64_t *out; int error; out = malloc(arg2 * sizeof(uint64_t), M_TEMP, M_WAITOK); for (int i = 0; i < arg2; i++) out[i] = counter_u64_fetch(((counter_u64_t *)arg1)[i]); error = SYSCTL_OUT(req, out, arg2 * sizeof(uint64_t)); free(out, M_TEMP); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ for (int i = 0; i < arg2; i++) counter_u64_zero(((counter_u64_t *)arg1)[i]); return (0); } /* * MP-friendly version of ppsratecheck(). * * Returns non-negative if we are in the rate, negative otherwise. * 0 - rate limit not reached. * -1 - rate limit reached. * >0 - rate limit was reached before, and was just reset. The return value * is number of events since last reset. */ int64_t counter_ratecheck(struct counter_rate *cr, int64_t limit) { int64_t val; int now; val = cr->cr_over; now = ticks; if ((u_int)(now - cr->cr_ticks) >= hz) { /* * Time to clear the structure, we are in the next second. * First try unlocked read, and then proceed with atomic. */ if ((cr->cr_lock == 0) && atomic_cmpset_acq_int(&cr->cr_lock, 0, 1)) { /* * Check if other thread has just went through the * reset sequence before us. */ if ((u_int)(now - cr->cr_ticks) >= hz) { val = counter_u64_fetch(cr->cr_rate); counter_u64_zero(cr->cr_rate); cr->cr_over = 0; cr->cr_ticks = now; if (val <= limit) val = 0; } atomic_store_rel_int(&cr->cr_lock, 0); } else /* * We failed to lock, in this case other thread may * be running counter_u64_zero(), so it is not safe * to do an update, we skip it. */ return (val); } counter_u64_add(cr->cr_rate, 1); if (cr->cr_over != 0) return (-1); if (counter_u64_fetch(cr->cr_rate) > limit) val = cr->cr_over = -1; return (val); } void counter_u64_sysinit(void *arg) { counter_u64_t *cp; cp = arg; *cp = counter_u64_alloc(M_WAITOK); } void counter_u64_sysuninit(void *arg) { counter_u64_t *cp; cp = arg; counter_u64_free(*cp); } Index: head/sys/kern/subr_pcpu.c =================================================================== --- head/sys/kern/subr_pcpu.c (revision 367383) +++ head/sys/kern/subr_pcpu.c (revision 367384) @@ -1,419 +1,417 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Wind River Systems, Inc. * All rights reserved. * Written by: John Baldwin * * Copyright (c) 2009 Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module provides MI support for per-cpu data. * * Each architecture determines the mapping of logical CPU IDs to physical * CPUs. The requirements of this mapping are as follows: * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. * - The mapping is not required to be dense. That is, there may be * gaps in the mappings. * - The platform sets the value of MAXCPU in . * - It is suggested, but not required, that in the non-SMP case, the * platform define MAXCPU to be 1 and define the logical ID of the * sole CPU as 0. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accouting."); struct dpcpu_free { uintptr_t df_start; int df_len; TAILQ_ENTRY(dpcpu_free) df_link; }; DPCPU_DEFINE_STATIC(char, modspace[DPCPU_MODMIN] __aligned(__alignof(void *))); static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head); static struct sx dpcpu_lock; uintptr_t dpcpu_off[MAXCPU]; struct pcpu *cpuid_to_pcpu[MAXCPU]; struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead); /* * Initialize the MI portions of a struct pcpu. */ void pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { bzero(pcpu, size); KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue; pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue; pcpu->pc_zpcpu_offset = zpcpu_offset_cpu(cpuid); } void dpcpu_init(void *dpcpu, int cpuid) { struct pcpu *pcpu; pcpu = pcpu_find(cpuid); pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START; /* * Initialize defaults from our linker section. */ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES); /* * Place it in the global pcpu offset array. */ dpcpu_off[cpuid] = pcpu->pc_dynamic; } static void dpcpu_startup(void *dummy __unused) { struct dpcpu_free *df; df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); df->df_start = (uintptr_t)&DPCPU_NAME(modspace); df->df_len = DPCPU_MODMIN; TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link); sx_init(&dpcpu_lock, "dpcpu alloc lock"); } SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, NULL); /* - * UMA_PCPU_ZONE zones, that are available for all kernel - * consumers. Right now 64 bit zone is used for counter(9) - * and int zone is used for mount point counters. + * UMA_ZONE_PCPU zones for general kernel use. */ -uma_zone_t pcpu_zone_int; -uma_zone_t pcpu_zone_64; +uma_zone_t pcpu_zone_4; +uma_zone_t pcpu_zone_8; static void pcpu_zones_startup(void) { - pcpu_zone_int = uma_zcreate("int pcpu", sizeof(int), + pcpu_zone_4 = uma_zcreate("pcpu-4", sizeof(uint32_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); - pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t), + pcpu_zone_8 = uma_zcreate("pcpu-8", sizeof(uint64_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_COUNTER, SI_ORDER_FIRST, pcpu_zones_startup, NULL); /* * First-fit extent based allocator for allocating space in the per-cpu * region reserved for modules. This is only intended for use by the * kernel linkers to place module linker sets. */ void * dpcpu_alloc(int size) { struct dpcpu_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_len < size) continue; if (df->df_len == size) { s = (void *)df->df_start; TAILQ_REMOVE(&dpcpu_head, df, df_link); free(df, M_PCPU); break; } s = (void *)df->df_start; df->df_len -= size; df->df_start = df->df_start + size; break; } sx_xunlock(&dpcpu_lock); return (s); } /* * Free dynamic per-cpu space at module unload time. */ void dpcpu_free(void *s, int size) { struct dpcpu_free *df; struct dpcpu_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)s; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_start > end) break; /* * If we expand at the end of an entry we may have to * merge it with the one following it as well. */ if (df->df_start + df->df_len == start) { df->df_len += size; dn = TAILQ_NEXT(df, df_link); if (df->df_start + df->df_len == dn->df_start) { df->df_len += dn->df_len; TAILQ_REMOVE(&dpcpu_head, dn, df_link); free(dn, M_PCPU); } sx_xunlock(&dpcpu_lock); return; } if (df->df_start == end) { df->df_start = start; df->df_len += size; sx_xunlock(&dpcpu_lock); return; } } dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); dn->df_start = start; dn->df_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, df_link); else TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link); sx_xunlock(&dpcpu_lock); } /* * Initialize the per-cpu storage from an updated linker-set region. */ void dpcpu_copy(void *s, int size) { #ifdef SMP uintptr_t dpcpu; int i; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; memcpy((void *)(dpcpu + (uintptr_t)s), s, size); } #else memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size); #endif } /* * Destroy a struct pcpu. */ void pcpu_destroy(struct pcpu *pcpu) { STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; dpcpu_off[pcpu->pc_cpuid] = 0; } /* * Locate a struct pcpu by cpu id. */ struct pcpu * pcpu_find(u_int cpuid) { return (cpuid_to_pcpu[cpuid]); } int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int64_t count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int64_t *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; long count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(long *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } #ifdef DDB DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off) { int id; CPU_FOREACH(id) { db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n", id, (uintmax_t)dpcpu_off[id], (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START)); } } static void show_pcpu(struct pcpu *pc) { struct thread *td; db_printf("cpuid = %d\n", pc->pc_cpuid); db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic); db_printf("curthread = "); td = pc->pc_curthread; if (td != NULL) db_printf("%p: pid %d tid %d critnest %d \"%s\"\n", td, td->td_proc->p_pid, td->td_tid, td->td_critnest, td->td_name); else db_printf("none\n"); db_printf("curpcb = %p\n", pc->pc_curpcb); db_printf("fpcurthread = "); td = pc->pc_fpcurthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("idlethread = "); td = pc->pc_idlethread; if (td != NULL) db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); else db_printf("none\n"); db_show_mdpcpu(pc); #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); #endif #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks, db_printf); #endif } DB_SHOW_COMMAND(pcpu, db_show_pcpu) { struct pcpu *pc; int id; if (have_addr) id = ((addr >> 4) % 16) * 10 + (addr % 16); else id = PCPU_GET(cpuid); pc = pcpu_find(id); if (pc == NULL) { db_printf("CPU %d not found\n", id); return; } show_pcpu(pc); } DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all) { struct pcpu *pc; int id; db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid)); CPU_FOREACH(id) { pc = pcpu_find(id); if (pc != NULL) { show_pcpu(pc); db_printf("\n"); } } } DB_SHOW_ALIAS(allpcpu, db_show_cpu_all); #endif Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 367383) +++ head/sys/kern/vfs_mount.c (revision 367384) @@ -1,2509 +1,2509 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx_padalign __exclusive_cache_line mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); static void mount_devctl_event(const char *type, struct mount *mp, bool donew); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); - mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); - mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); - mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); - mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; mp->mnt_rootvnode = NULL; return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; - uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); - uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); - uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); - uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); + uma_zfree_pcpu(pcpu_zone_4, mp->mnt_writeopcount_pcpu); + uma_zfree_pcpu(pcpu_zone_4, mp->mnt_lockref_pcpu); + uma_zfree_pcpu(pcpu_zone_4, mp->mnt_ref_pcpu); + uma_zfree_pcpu(pcpu_zone_4, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif int sys_nmount(struct thread *td, struct nmount_args *uap) { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_lazyvnodelist); mp->mnt_lazyvnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); vfs_assert_mount_counters(mp); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_lazyvnodelistsize != 0) panic("vfs_mount_destroy: nonzero lazyvnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); if (mp->mnt_rootvnode != NULL) panic("%s: mount point still has a root vnode %p\n", __func__, mp->mnt_rootvnode); if (mp->mnt_vnodecovered != NULL) vrele(mp->mnt_vnodecovered); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static bool vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) { /* This is an upgrade of an exisiting mount. */ if ((fsflags & MNT_UPDATE) != 0) return (false); /* This is already an R/O mount. */ if ((fsflags & MNT_RDONLY) != 0) return (false); switch (error) { case ENODEV: /* generic, geom, ... */ case EACCES: /* cam/scsi, ... */ case EROFS: /* md, mmcsd, ... */ /* * These errors can be returned by the storage layer to signal * that the media is read-only. No harm in the R/O mount * attempt if the error was returned for some other reason. */ return (true); default: return (false); } } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; bool autoro; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; autoro = default_autoro; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { int do_freeopt = 0; if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; do_freeopt = 1; } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; do_freeopt = 1; } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; do_freeopt = 1; } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "ro") == 0) { fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { do_freeopt = 1; autoro = true; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; do_freeopt = 1; } else if (strcmp(opt->name, "nocover") == 0) { fsflags |= MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "cover") == 0) { fsflags &= ~MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "emptydir") == 0) { fsflags |= MNT_EMPTYDIR; do_freeopt = 1; } else if (strcmp(opt->name, "noemptydir") == 0) { fsflags &= ~MNT_EMPTYDIR; do_freeopt = 1; } if (do_freeopt) vfs_freeopt(optlist, opt); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); /* * See if we can mount in the read-only mode if the error code suggests * that it could be possible and the mount options allow for that. * Never try it if "[no]{ro|rw}" has been explicitly requested and not * overridden by "autoro". */ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { printf("%s: R/W mount failed, possibly R/O media," " trying R/O mount\n", __func__); fsflags |= MNT_RDONLY; error = vfs_domount(td, fstype, fspath, fsflags, &optlist); } bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(struct thread *td, struct mount_args *uap) { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && vfsp->vfc_vfsops->vfs_cmount == NULL)) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp, *rootvp; int error, error1; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); if ((fsflags & MNT_EMPTYDIR) != 0) { error = vfs_emptydir(vp); if (error != 0) { vput(vp); return (error); } } /* * If the jail of the calling thread lacks permission for this type of * file system, deny immediately. */ if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, vfsp->vfc_prison_flag)) { vput(vp); return (EPERM); } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } vn_seqc_write_begin(vp); VOP_UNLOCK(vp); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error1 = 0; if ((error = VFS_MOUNT(mp)) != 0 || (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { rootvp = NULL; if (error1 != 0) { error = error1; rootvp = vfs_cache_root_clear(mp); if (rootvp != NULL) { vhold(rootvp); vrele(rootvp); } if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) printf("VFS_UNMOUNT returned %d\n", error1); } vfs_unbusy(mp); mp->mnt_vnodecovered = NULL; vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } vn_seqc_write_end(vp); vrele(vp); return (error); } vn_seqc_write_begin(newdp); VOP_UNLOCK(newdp); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(vp); EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp); mount_devctl_event("MOUNT", mp, false); mountcheckdirs(vp, newdp); vn_seqc_write_end(vp); vn_seqc_write_end(newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_op_exit(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; struct o2export_args o2export; struct vnode *rootvp; void *bufp; struct mount *mp; int error, export_error, i, len; uint64_t flag; gid_t *grps; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp); vfs_op_enter(mp); vn_seqc_write_begin(vp); rootvp = NULL; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; rootvp = vfs_cache_root_clear(mp); MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&o2export, sizeof(o2export)); /* FALLTHROUGH */ case (sizeof(o2export)): bcopy(bufp, &o2export, len); export.ex_flags = (uint64_t)o2export.ex_flags; export.ex_root = o2export.ex_root; export.ex_uid = o2export.ex_anon.cr_uid; export.ex_groups = NULL; export.ex_ngroups = o2export.ex_anon.cr_ngroups; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= XU_NGROUPS) { export.ex_groups = malloc( export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); for (i = 0; i < export.ex_ngroups; i++) export.ex_groups[i] = o2export.ex_anon.cr_groups[i]; } else export_error = EINVAL; } else if (export.ex_ngroups < 0) export_error = EINVAL; export.ex_addr = o2export.ex_addr; export.ex_addrlen = o2export.ex_addrlen; export.ex_mask = o2export.ex_mask; export.ex_masklen = o2export.ex_masklen; export.ex_indexfile = o2export.ex_indexfile; export.ex_numsecflavors = o2export.ex_numsecflavors; if (export.ex_numsecflavors < MAXSECFLAVORS) { for (i = 0; i < export.ex_numsecflavors; i++) export.ex_secflavors[i] = o2export.ex_secflavors[i]; } else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(export.ex_groups, M_TEMP); break; case (sizeof(export)): bcopy(bufp, &export, len); grps = NULL; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= NGROUPS_MAX) { grps = malloc(export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); export_error = copyin(export.ex_groups, grps, export.ex_ngroups * sizeof(gid_t)); if (export_error == 0) export.ex_groups = grps; } else export_error = EINVAL; } else if (export.ex_ngroups == 0) export.ex_groups = NULL; else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(grps, M_TEMP); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; mount_devctl_event("REMOUNT", mp, true); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } vn_seqc_write_end(vp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { if ((vp->v_vflag & VV_ROOT) != 0 && (fsflags & MNT_NOCOVER) != 0) { vput(vp); return (EBUSY); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); if (error == 0) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { return (kern_unmount(td, uap->path, uap->flags)); } int kern_unmount(struct thread *td, const char *path, int flags) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vn_finished_write(mp); } /* * There are various reference counters associated with the mount point. * Normally it is permitted to modify them without taking the mnt ilock, * but this behavior can be temporarily disabled if stable value is needed * or callers are expected to block (e.g. to not allow new users during * forced unmount). */ void vfs_op_enter(struct mount *mp) { int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; if (mp->mnt_vfs_ops > 1) { MNT_IUNLOCK(mp); return; } vfs_op_barrier_wait(mp); CPU_FOREACH(cpu) { mp->mnt_ref += zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); mp->mnt_lockref += zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); mp->mnt_writeopcount += zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); } if (mp->mnt_ref <= 0 || mp->mnt_lockref < 0 || mp->mnt_writeopcount < 0) panic("%s: invalid count(s) on mp %p: ref %d lockref %d writeopcount %d\n", __func__, mp, mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount); MNT_IUNLOCK(mp); vfs_assert_mount_counters(mp); } void vfs_op_exit_locked(struct mount *mp) { mtx_assert(MNT_MTX(mp), MA_OWNED); if (mp->mnt_vfs_ops <= 0) panic("%s: invalid vfs_ops count %d for mp %p\n", __func__, mp->mnt_vfs_ops, mp); mp->mnt_vfs_ops--; } void vfs_op_exit(struct mount *mp) { MNT_ILOCK(mp); vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); } struct vfs_op_barrier_ipi { struct mount *mp; struct smp_rendezvous_cpus_retry_arg srcra; }; static void vfs_op_action_func(void *arg) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; if (!vfs_op_thread_entered(mp)) smp_rendezvous_cpus_done(arg); } static void vfs_op_wait_func(void *arg, int cpu) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; int *in_op; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu); while (atomic_load_int(in_op)) cpu_spinwait(); } void vfs_op_barrier_wait(struct mount *mp) { struct vfs_op_barrier_ipi vfsopipi; vfsopipi.mp = mp; smp_rendezvous_cpus_retry(all_cpus, smp_no_rendezvous_barrier, vfs_op_action_func, smp_no_rendezvous_barrier, vfs_op_wait_func, &vfsopipi.srcra); } #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *mp) { int cpu; if (mp->mnt_vfs_ops == 0) return; CPU_FOREACH(cpu) { if (*zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || *zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || *zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) vfs_dump_mount_counters(mp); } } void vfs_dump_mount_counters(struct mount *mp) { int cpu, *count; int ref, lockref, writeopcount; printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); printf(" ref : "); ref = mp->mnt_ref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); printf("%d ", *count); ref += *count; } printf("\n"); printf(" lockref : "); lockref = mp->mnt_lockref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); printf("%d ", *count); lockref += *count; } printf("\n"); printf("writeopcount: "); writeopcount = mp->mnt_writeopcount; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); printf("%d ", *count); writeopcount += *count; } printf("\n"); printf("counter struct total\n"); printf("ref %-5d %-5d\n", mp->mnt_ref, ref); printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); panic("invalid counts on struct mount"); } #endif int vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) { int *base, *pcpu; int cpu, sum; switch (which) { case MNT_COUNT_REF: base = &mp->mnt_ref; pcpu = mp->mnt_ref_pcpu; break; case MNT_COUNT_LOCKREF: base = &mp->mnt_lockref; pcpu = mp->mnt_lockref_pcpu; break; case MNT_COUNT_WRITEOPCOUNT: base = &mp->mnt_writeopcount; pcpu = mp->mnt_writeopcount_pcpu; break; } sum = *base; CPU_FOREACH(cpu) { sum += *zpcpu_get_cpu(pcpu, cpu); } return (sum); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *rootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vfs_rel(mp); return (error); } vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; rootvp = vfs_cache_root_clear(mp); if (coveredvp != NULL) vn_seqc_write_begin(coveredvp); if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { vn_seqc_write_end(coveredvp); dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); /* * We want to keep the vnode around so that we can vn_seqc_write_end * after we are done with unmount. Downgrade our reference to a mere * hold count so that we don't interefere with anything. */ if (rootvp != NULL) { vhold(rootvp); vrele(rootvp); } if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_periodic(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_deallocate_syncvnode(mp); error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp) { vn_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vn_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } mount_devctl_event("UNMOUNT", mp, false); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { vrele(rootvnode); rootvnode = NULL; } if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { /* * Filesystems only fill in part of the structure for updates, we * have to read the entirety first to get all content. */ if (sbp != &mp->mnt_stat) memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); /* * Set these in case the underlying filesystem fails to do so. */ sbp->f_version = STATFS_VERSION; sbp->f_namemax = NAME_MAX; sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return (mp->mnt_op->vfs_statfs(mp, sbp)); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } /* Map from mount options to printable formats. */ static struct mntoptnames optnames[] = { MNTOPT_NAMES }; static void mount_devctl_event_mntopt(struct sbuf *sb, const char *what, struct vfsoptlist *opts) { struct vfsopt *opt; if (opts == NULL || TAILQ_EMPTY(opts)) return; sbuf_printf(sb, " %s=\"", what); TAILQ_FOREACH(opt, opts, link) { if (opt->name[0] == '\0' || (opt->len > 0 && *(char *)opt->value == '\0')) continue; devctl_safe_quote_sb(sb, opt->name); if (opt->len > 0) { sbuf_putc(sb, '='); devctl_safe_quote_sb(sb, opt->value); } sbuf_putc(sb, ';'); } sbuf_putc(sb, '"'); } #define DEVCTL_LEN 1024 static void mount_devctl_event(const char *type, struct mount *mp, bool donew) { const uint8_t *cp; struct mntoptnames *fp; struct sbuf sb; struct statfs *sfp = &mp->mnt_stat; char *buf; buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT); if (buf == NULL) return; sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN); sbuf_cpy(&sb, "mount-point=\""); devctl_safe_quote_sb(&sb, sfp->f_mntonname); sbuf_cat(&sb, "\" mount-dev=\""); devctl_safe_quote_sb(&sb, sfp->f_mntfromname); sbuf_cat(&sb, "\" mount-type=\""); devctl_safe_quote_sb(&sb, sfp->f_fstypename); sbuf_cat(&sb, "\" fsid=0x"); cp = (const uint8_t *)&sfp->f_fsid.val[0]; for (int i = 0; i < sizeof(sfp->f_fsid); i++) sbuf_printf(&sb, "%02x", cp[i]); sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner); for (fp = optnames; fp->o_opt != 0; fp++) { if ((mp->mnt_flag & fp->o_opt) != 0) { sbuf_cat(&sb, fp->o_name); sbuf_putc(&sb, ';'); } } sbuf_putc(&sb, '"'); mount_devctl_event_mntopt(&sb, "opt", mp->mnt_opt); if (donew) mount_devctl_event_mntopt(&sb, "optnew", mp->mnt_optnew); sbuf_finish(&sb); if (sbuf_error(&sb) == 0) devctl_notify("VFS", "FS", type, sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_MOUNT); } Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 367383) +++ head/sys/sys/param.h (revision 367384) @@ -1,370 +1,370 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300125 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300126 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL #ifndef LOCORE /* Signals. */ #include #endif #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: head/sys/vm/uma.h =================================================================== --- head/sys/vm/uma.h (revision 367383) +++ head/sys/vm/uma.h (revision 367384) @@ -1,732 +1,732 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef _VM_UMA_H_ #define _VM_UMA_H_ #include /* For NULL */ #include /* For M_* */ #include /* User visible parameters */ #define UMA_SMALLEST_UNIT 8 /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * The initializer and the destructor should leave the object in the same * state. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter-part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ typedef int (*uma_import)(void *arg, void **store, int count, int domain, int flags); /* * Free memory from a cache zone. */ typedef void (*uma_release)(void *arg, void **store, int count); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embedded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corresponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a primary zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for primary zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * primary A reference to this zone's Primary Zone which contains the * backing Keg for the Secondary Zone being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary); /* * Create cache-only zones. * * This allows uma's per-cpu cache facilities to handle arbitrary * pointers. Consumers must specify the import and release functions to * fill and destroy caches. UMA does not allocate any memory for these * zones. The 'arg' parameter is passed to import/release and is caller * specific. */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_CONTIG 0x0004 /* * Physical memory underlying an object * must be contiguous. */ #define UMA_ZONE_NOTOUCH 0x0008 /* UMA may not access the memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_NOTPAGE 0x0100 /* allocf memory not vm pages */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ #define UMA_ZONE_NOBUCKET 0x0400 /* Do not use buckets. */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets. */ #define UMA_ZONE_MINBUCKET 0x1000 /* Use smallest buckets. */ #define UMA_ZONE_CACHESPREAD 0x2000 /* * Spread memory start locations across * all possible cache lines. May * require many virtually contiguous * backend pages and can fail early. */ #define UMA_ZONE_NODUMP 0x4000 /* * Zone's pages will not be included in * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* * Allocates mp_maxid + 1 slabs of * PAGE_SIZE */ #define UMA_ZONE_FIRSTTOUCH 0x10000 /* First touch NUMA policy */ #define UMA_ZONE_ROUNDROBIN 0x20000 /* Round-robin NUMA policy. */ #define UMA_ZONE_SMR 0x40000 /* * Safe memory reclamation defers * frees until all read sections * have exited. This flag creates * a unique SMR context for this * zone. To share contexts see * uma_zone_set_smr() below. * * See sys/smr.h for more details. */ /* In use by UMA_ZFLAGs: 0xffe00000 */ /* * These flags are shared between the keg and zone. Some are determined * based on physical parameters of the request and may not be provided by * the consumer. */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_NOTOUCH | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ UMA_ZONE_VM | UMA_ZONE_NOTPAGE | UMA_ZONE_PCPU | \ UMA_ZONE_FIRSTTOUCH | UMA_ZONE_ROUNDROBIN) /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */ #define UMA_ALIGNOF(type) (_Alignof(type) - 1) /* Alignment fit for 'type' */ #define UMA_ANYDOMAIN -1 /* Special value for domain search. */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non-null pointer to an initialized element from the zone is * guaranteed if the wait flag is M_WAITOK. Otherwise a null pointer * may be returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* Allocate per-cpu data. Access the correct data with zpcpu_get(). */ void *uma_zalloc_pcpu_arg(uma_zone_t zone, void *arg, int flags); /* Use with SMR zones. */ void *uma_zalloc_smr(uma_zone_t zone, int flags); /* * Allocate an item from a specific NUMA domain. This uses a slow path in * the allocator but is guaranteed to allocate memory from the requested * domain if M_WAITOK is set. * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * domain The domain to allocate from. * flags See sys/malloc.h for available flags. */ void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void *uma_zalloc_pcpu(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } static __inline void * uma_zalloc_pcpu(uma_zone_t zone, int flags) { return uma_zalloc_pcpu_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* Use with PCPU zones. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *arg); /* Use with SMR zones. */ void uma_zfree_smr(uma_zone_t zone, void *item); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item) { uma_zfree_pcpu_arg(zone, item, NULL); } /* * Wait until the specified zone can allocate an item. */ void uma_zwait(uma_zone_t zone); /* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. * domain The NUMA domain that we prefer for this allocation. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, uint8_t *pflag, int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages. * size The original size of the allocation. * pflag The flags for the slab. See UMA_SLAB_* below. * * Returns: * None */ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); /* * Reclaims unused memory * * Arguments: * req Reclamation request type. * Returns: * None */ #define UMA_RECLAIM_DRAIN 1 /* release bucket cache */ #define UMA_RECLAIM_DRAIN_CPU 2 /* release bucket and per-CPU caches */ #define UMA_RECLAIM_TRIM 3 /* trim bucket cache to WSS */ void uma_reclaim(int req); void uma_zone_reclaim(uma_zone_t, int req); /* * Sets the alignment mask to be used for all zones requesting cache * alignment. Should be called by MD boot code prior to starting VM/UMA. * * Arguments: * align The alignment mask * * Returns: * Nothing */ void uma_set_align(int align); /* * Set a reserved number of items to hold for M_USE_RESERVE allocations. All * other requests must allocate new backing pages. */ void uma_zone_reserve(uma_zone_t zone, int nitems); /* * Reserves the maximum KVA space required by the zone and configures the zone * to use a VM_ALLOC_NOOBJ-based backend allocator. * * Arguments: * zone The zone to update. * nitems The upper limit on the number of items that can be allocated. * * Returns: * 0 if KVA space can not be allocated * 1 if successful * * Discussion: * When the machine supports a direct map and the zone's items are smaller * than a page, the zone will use the direct map instead of allocating KVA * space. */ int uma_zone_reserve_kva(uma_zone_t zone, int nitems); /* * Sets a high limit on the number of items allowed in a zone * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed * * Returns: * int The effective value of nitems */ int uma_zone_set_max(uma_zone_t zone, int nitems); /* * Sets a high limit on the number of items allowed in zone's bucket cache * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems); /* * Obtains the effective limit on the number of items in a zone * * Arguments: * zone The zone to obtain the effective limit from * * Return: * 0 No limit * int The effective limit of the zone */ int uma_zone_get_max(uma_zone_t zone); /* * Sets a warning to be printed when limit is reached * * Arguments: * zone The zone we will warn about * warning Warning content * * Returns: * Nothing */ void uma_zone_set_warning(uma_zone_t zone, const char *warning); /* * Sets a function to run when limit is reached * * Arguments: * zone The zone to which this applies * fx The function ro run * * Returns: * Nothing */ typedef void (*uma_maxaction_t)(uma_zone_t, int); void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t); /* * Obtains the approximate current number of items allocated from a zone * * Arguments: * zone The zone to obtain the current allocation count from * * Return: * int The approximate current number of items allocated from the zone */ int uma_zone_get_cur(uma_zone_t zone); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard backend allocator for this zone. * * Arguments: * zone The zone whose backend allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); /* * Associate a zone with a smr context that is allocated after creation * so that multiple zones may share the same context. */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr); /* * Fetch the smr context that was set or made in uma_zcreate(). */ smr_t uma_zone_get_smr(uma_zone_t zone); /* * These flags are setable in the allocf and visible in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kmem */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ /* 0x02, 0x10, 0x40, and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to determine if a fixed-size zone is exhausted. * * Arguments: * zone The zone to check * * Returns: * Non-zero if zone is exhausted. */ int uma_zone_exhausted(uma_zone_t zone); /* * Returns the bytes of memory consumed by the zone. */ size_t uma_zone_memory(uma_zone_t zone); /* * Common UMA_ZONE_PCPU zones. */ -extern uma_zone_t pcpu_zone_int; -extern uma_zone_t pcpu_zone_64; +extern uma_zone_t pcpu_zone_4; +extern uma_zone_t pcpu_zone_8; /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consists of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { uint32_t ush_version; /* Stream format version. */ uint32_t ush_maxcpus; /* Value of MAXCPU for stream. */ uint32_t ush_count; /* Number of records. */ uint32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; uint32_t uth_align; /* Keg: alignment. */ uint32_t uth_size; /* Keg: requested size of item. */ uint32_t uth_rsize; /* Keg: real size of item. */ uint32_t uth_maxpages; /* Keg: maximum number of pages. */ uint32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ uint32_t uth_pages; /* Keg: pages allocated. */ uint32_t uth_keg_free; /* Keg: items free. */ uint32_t uth_zone_free; /* Zone: items free. */ uint32_t uth_bucketsize; /* Zone: desired bucket size. */ uint32_t uth_zone_flags; /* Zone: flags. */ uint64_t uth_allocs; /* Zone: number of allocations. */ uint64_t uth_frees; /* Zone: number of frees. */ uint64_t uth_fails; /* Zone: number of alloc failures. */ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */ uint64_t uth_xdomain; /* Zone: Number of cross domain frees. */ uint64_t _uth_reserved1[1]; /* Reserved. */ }; struct uma_percpu_stat { uint64_t ups_allocs; /* Cache: number of allocations. */ uint64_t ups_frees; /* Cache: number of frees. */ uint64_t ups_cache_free; /* Cache: free items in cache. */ uint64_t _ups_reserved[5]; /* Reserved. */ }; void uma_reclaim_wakeup(void); void uma_reclaim_worker(void *); unsigned long uma_limit(void); /* Return the amount of memory managed by UMA. */ unsigned long uma_size(void); /* Return the amount of memory remaining. May be negative. */ long uma_avail(void); #endif /* _VM_UMA_H_ */