Index: projects/calloutng/sys/kern/kern_condvar.c =================================================================== --- projects/calloutng/sys/kern/kern_condvar.c (revision 237201) +++ projects/calloutng/sys/kern/kern_condvar.c (revision 237202) @@ -1,453 +1,457 @@ /*- * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * Common sanity checks for cv_wait* functions. 
*/
#define CV_ASSERT(cvp, lock, td) do { \
	KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \
	KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \
	KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
	KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \
} while (0)

/*
 * Initialize a condition variable.  Must be called before use.
 *
 * Stores 'desc' as the description string and resets the waiter count
 * to zero.
 */
void
cv_init(struct cv *cvp, const char *desc)
{

	cvp->cv_description = desc;
	cvp->cv_waiters = 0;
}

/*
 * Destroy a condition variable.  The condition variable must be
 * re-initialized in order to be re-used.
 *
 * With INVARIANTS this asserts that no sleep queue is still associated
 * with the condition variable; without INVARIANTS the body is empty.
 */
void
cv_destroy(struct cv *cvp)
{
#ifdef INVARIANTS
	struct sleepqueue *sq;

	/* sleepq_lookup() asserts that the chain lock is held. */
	sleepq_lock(cvp);
	sq = sleepq_lookup(cvp);
	sleepq_release(cvp);
	KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__));
#endif
}

/*
 * Wait on a condition variable.  The current thread is placed on the
 * condition variable's wait queue and suspended.  A cv_signal or
 * cv_broadcast on the same condition variable will resume the thread.
 * The mutex is released before sleeping and will be held on return.  It
 * is recommended that the mutex be held when cv_signal or cv_broadcast
 * are called.
 *
 * If 'cold' or 'panicstr' is set the function returns immediately
 * without sleeping and without touching 'lock'.
 */
void
_cv_wait(struct cv *cvp, struct lock_object *lock)
{
	WITNESS_SAVE_DECL(lock_witness);
	struct lock_class *class;
	struct thread *td;
	int lock_state;

	td = curthread;
	/* Only overwritten by lc_unlock() below when lock is not Giant. */
	lock_state = 0;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(1, 0, cv_wmesg(cvp));
#endif
	CV_ASSERT(cvp, lock, td);
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
	    "Waiting on \"%s\"", cvp->cv_description);
	class = LOCK_CLASS(lock);

	if (cold || panicstr) {
		/*
		 * During autoconfiguration, just give interrupts
		 * a chance, then just return.  Don't run any other
		 * thread or panic below, in case this is the idle
		 * process and already asleep.
		 */
		return;
	}

	/* The waiter count is modified with the sleepq chain lock held. */
	sleepq_lock(cvp);

	cvp->cv_waiters++;
	if (lock == &Giant.lock_object)
		mtx_assert(&Giant, MA_OWNED);
	DROP_GIANT();

	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
	if (lock != &Giant.lock_object) {
		/*
		 * For LC_SLEEPABLE lock classes the sleepq chain lock is
		 * dropped around lc_unlock() and re-taken afterwards.
		 */
		if (class->lc_flags & LC_SLEEPABLE)
			sleepq_release(cvp);
		WITNESS_SAVE(lock, lock_witness);
		lock_state = class->lc_unlock(lock);
		if (class->lc_flags & LC_SLEEPABLE)
			sleepq_lock(cvp);
	}
	sleepq_wait(cvp, 0);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(0, 0, cv_wmesg(cvp));
#endif
	PICKUP_GIANT();
	/* Re-take the caller's lock in the state lc_unlock() reported. */
	if (lock != &Giant.lock_object) {
		class->lc_lock(lock, lock_state);
		WITNESS_RESTORE(lock, lock_witness);
	}
}

/*
 * Wait on a condition variable.  This function differs from cv_wait by
 * not acquiring the mutex after the condition variable was signaled.
 *
 * Must not be used with Giant (asserted).  On the 'cold'/'panicstr'
 * early-return path the lock is still released before returning.
 */
void
_cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
{
	struct lock_class *class;
	struct thread *td;

	td = curthread;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(1, 0, cv_wmesg(cvp));
#endif
	CV_ASSERT(cvp, lock, td);
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
	    "Waiting on \"%s\"", cvp->cv_description);
	KASSERT(lock != &Giant.lock_object,
	    ("cv_wait_unlock cannot be used with Giant"));
	class = LOCK_CLASS(lock);

	if (cold || panicstr) {
		/*
		 * During autoconfiguration, just give interrupts
		 * a chance, then just return.  Don't run any other
		 * thread or panic below, in case this is the idle
		 * process and already asleep.
		 */
		class->lc_unlock(lock);
		return;
	}

	sleepq_lock(cvp);

	cvp->cv_waiters++;
	DROP_GIANT();

	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
	/* Same LC_SLEEPABLE handling as in _cv_wait(). */
	if (class->lc_flags & LC_SLEEPABLE)
		sleepq_release(cvp);
	class->lc_unlock(lock);
	if (class->lc_flags & LC_SLEEPABLE)
		sleepq_lock(cvp);
	sleepq_wait(cvp, 0);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(0, 0, cv_wmesg(cvp));
#endif
	PICKUP_GIANT();
}

/*
 * Wait on a condition variable, allowing interruption by signals. Return 0 if
 * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if
 * a signal was caught.
If ERESTART is returned the system call should be
 * restarted if possible.
 */
/*
 * Implementation notes: the value returned is whatever sleepq_wait_sig()
 * reports, except on the 'cold'/'panicstr' path where 0 is returned
 * without sleeping and without touching 'lock'.
 */
int
_cv_wait_sig(struct cv *cvp, struct lock_object *lock)
{
	WITNESS_SAVE_DECL(lock_witness);
	struct lock_class *class;
	struct thread *td;
	int lock_state, rval;

	td = curthread;
	/* Only overwritten by lc_unlock() below when lock is not Giant. */
	lock_state = 0;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(1, 0, cv_wmesg(cvp));
#endif
	CV_ASSERT(cvp, lock, td);
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
	    "Waiting on \"%s\"", cvp->cv_description);
	class = LOCK_CLASS(lock);

	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration, just give
		 * interrupts a chance, then just return; don't run any other
		 * procs or panic below, in case this is the idle process and
		 * already asleep.
		 */
		return (0);
	}

	/* The waiter count is modified with the sleepq chain lock held. */
	sleepq_lock(cvp);

	cvp->cv_waiters++;
	if (lock == &Giant.lock_object)
		mtx_assert(&Giant, MA_OWNED);
	DROP_GIANT();

	/* SLEEPQ_INTERRUPTIBLE marks this sleep as interruptible. */
	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
	    SLEEPQ_INTERRUPTIBLE, 0);
	if (lock != &Giant.lock_object) {
		/*
		 * For LC_SLEEPABLE lock classes the sleepq chain lock is
		 * dropped around lc_unlock() and re-taken afterwards.
		 */
		if (class->lc_flags & LC_SLEEPABLE)
			sleepq_release(cvp);
		WITNESS_SAVE(lock, lock_witness);
		lock_state = class->lc_unlock(lock);
		if (class->lc_flags & LC_SLEEPABLE)
			sleepq_lock(cvp);
	}
	rval = sleepq_wait_sig(cvp, 0);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(0, 0, cv_wmesg(cvp));
#endif
	PICKUP_GIANT();
	/* Re-take the caller's lock in the state lc_unlock() reported. */
	if (lock != &Giant.lock_object) {
		class->lc_lock(lock, lock_state);
		WITNESS_RESTORE(lock, lock_witness);
	}

	return (rval);
}

/*
 * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the
 * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout
 * expires.
*/ int _cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * thread or panic below, in case this is the idle process and * already asleep. */ return 0; } sleepq_lock(cvp); cvp->cv_waiters++; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_set_timeout(cvp, timo); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for at most timo/hz seconds, allowing * interruption by signals. Returns 0 if the thread was resumed by cv_signal * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if * a signal was caught. 
*/ int -_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo) +_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, + struct bintime *bt, int timo) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * thread or panic below, in case this is the idle process and * already asleep. */ return 0; } sleepq_lock(cvp); cvp->cv_waiters++; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); - sleepq_set_timeout(cvp, timo); + if (bt == NULL) + sleepq_set_timeout(cvp, timo); + else + sleepq_set_timeout_bt(cvp, *bt); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Signal a condition variable, wakes up one waiting thread. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional threads * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. 
*/
/*
 * Implementation notes: nothing is woken when cv_waiters is zero.  The
 * waiter count is decremented and sleepq_signal() is called with the
 * sleepq chain lock held; kick_proc0() is called after the chain lock has
 * been released, and only if sleepq_signal() returned non-zero.
 */
void
cv_signal(struct cv *cvp)
{
	int wakeup_swapper;

	wakeup_swapper = 0;
	sleepq_lock(cvp);
	if (cvp->cv_waiters > 0) {
		cvp->cv_waiters--;
		wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0);
	}
	sleepq_release(cvp);
	if (wakeup_swapper)
		kick_proc0();
}

/*
 * Broadcast a signal to a condition variable.  Wakes up all waiting
 * threads.  Should be called with the same mutex as was passed to cv_wait
 * held.
 *
 * A 'pri' of -1 is translated to 0 before being passed to
 * sleepq_broadcast().  The waiter count is reset to zero under the sleepq
 * chain lock; kick_proc0() is called after the lock is released if
 * sleepq_broadcast() returned non-zero.
 */
void
cv_broadcastpri(struct cv *cvp, int pri)
{
	int wakeup_swapper;

	/*
	 * XXX sleepq_broadcast pri argument changed from -1 meaning
	 * no pri to 0 meaning no pri.
	 */
	wakeup_swapper = 0;
	if (pri == -1)
		pri = 0;
	sleepq_lock(cvp);
	if (cvp->cv_waiters > 0) {
		cvp->cv_waiters = 0;
		wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0);
	}
	sleepq_release(cvp);
	if (wakeup_swapper)
		kick_proc0();
}
Index: projects/calloutng/sys/kern/subr_sleepqueue.c
===================================================================
--- projects/calloutng/sys/kern/subr_sleepqueue.c (revision 237201)
+++ projects/calloutng/sys/kern/subr_sleepqueue.c (revision 237202)
@@ -1,1255 +1,1245 @@
/*-
 * Copyright (c) 2004 John Baldwin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implementation of sleep queues used to hold queue of threads blocked on
 * a wait channel.  Sleep queues differ from turnstiles in that wait
 * channels are not owned by anyone, so there is no priority propagation.
 * Sleep queues can also provide a timeout and can also be interrupted by
 * signals.  That said, there are several similarities between the turnstile
 * and sleep queue implementations.  (Note: turnstiles were implemented
 * first.)  For example, both use a hash table of the same size where each
 * bucket is referred to as a "chain" that contains both a spin lock and
 * a linked list of queues.  An individual queue is located by using a hash
 * to pick a chain, locking the chain, and then walking the chain searching
 * for the queue.  This means that a wait channel object does not need to
 * embed its queue head just as locks do not embed their turnstile queue
 * head.  Threads also carry around a sleep queue that they lend to the
 * wait channel when blocking.  Just as in turnstiles, the queue includes
 * a free list of the sleep queues of other threads blocked on the same
 * wait channel in the case of multiple waiters.
 *
 * Some additional functionality provided by sleep queues includes the
 * ability to set a timeout.  The timeout is managed using a per-thread
 * callout that resumes a thread if it is asleep.  A thread may also
 * catch signals while it is asleep (aka an interruptible sleep).  The
 * signal code uses sleepq_abort() to interrupt a sleeping thread.  Finally,
 * sleep queues also provide some extra assertions.  One is not allowed to
 * mix the sleep/wakeup and cv APIs for a given wait channel.  Also, one
 * must consistently use the same lock to synchronize with a wait channel,
 * though this check is currently only a warning for sleep/wakeup due to
 * pre-existing abuse of that API.  The same lock must also be held when
 * awakening threads, though that is currently only enforced for condition
 * variables.
 */

#include
__FBSDID("$FreeBSD$");

#include "opt_sleepqueue_profiling.h"
#include "opt_ddb.h"
#include "opt_kdtrace.h"
#include "opt_sched.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef DDB
#include
#endif

/*
 * Constants for the hash table of sleep queue chains.  These constants are
 * the same ones that 4BSD (and possibly earlier versions of BSD) used.
 * Basically, we ignore the lower 8 bits of the address since most wait
 * channel pointers are aligned and only look at the next 7 bits for the
 * hash.  SC_TABLESIZE must be a power of two for SC_MASK to work properly.
 *
 * SC_LOOKUP() yields a pointer to the chain a wait channel hashes to.
 * NR_SLEEPQS is the number of per-sleepqueue blocked-thread lists.
 */
#define SC_TABLESIZE 128 /* Must be power of 2. */
#define SC_MASK (SC_TABLESIZE - 1)
#define SC_SHIFT 8
#define SC_HASH(wc) (((uintptr_t)(wc) >> SC_SHIFT) & SC_MASK)
#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
#define NR_SLEEPQS 2
/*
 * There are two different lists of sleep queues.  Both lists are connected
 * via the sq_hash entries.  The first list is the sleep queue chain list
 * that a sleep queue is on when it is attached to a wait channel.  The
 * second list is the free list hung off of a sleep queue that is attached
 * to a wait channel.
 *
 * Each sleep queue also contains the wait channel it is attached to, the
 * list of threads blocked on that wait channel, flags specific to the
 * wait channel, and the lock used to synchronize with a wait channel.
 * The flags are used to catch mismatches between the various consumers
 * of the sleep queue API (e.g. sleep/wakeup and condition variables).
 * The lock pointer is only used when invariants are enabled for various
 * debugging checks.
 *
 * Locking key:
 * c - sleep queue chain lock
 */
struct sleepqueue {
	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
	u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */
	LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
	LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
	void *sq_wchan; /* (c) Wait channel. */
	int sq_type; /* (c) Queue type. */
#ifdef INVARIANTS
	struct lock_object *sq_lock; /* (c) Associated lock. */
#endif
};

/*
 * One hash bucket: the list of sleep queues hashing to this chain and the
 * spin lock protecting them.  The depth counters exist only when
 * SLEEPQUEUE_PROFILING is defined.
 */
struct sleepqueue_chain {
	LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
	struct mtx sc_lock; /* Spin lock for this chain. */
#ifdef SLEEPQUEUE_PROFILING
	u_int sc_depth; /* Length of sc_queues. */
	u_int sc_max_depth; /* Max length of sc_queues. */
#endif
};

#ifdef SLEEPQUEUE_PROFILING
/* Largest sc_max_depth seen on any chain; exported via sysctl below. */
u_int sleepq_max_depth;
static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
    "sleepq chain stats");
SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
    0, "maxmimum depth achieved of a single chain");

static void sleepq_profile(const char *wmesg);
static int prof_enabled;
#endif
/* The hash table of chains and the UMA zone sleep queues come from. */
static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
static uma_zone_t sleepq_zone;

/*
 * Prototypes for non-exported routines.
*/
static int sleepq_catch_signals(void *wchan, int pri);
static int sleepq_check_signals(void);
static int sleepq_check_timeout(void);
#ifdef INVARIANTS
static void sleepq_dtor(void *mem, int size, void *arg);
#endif
static int sleepq_init(void *mem, int size, int flags);
static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
    int pri);
static void sleepq_switch(void *wchan, int pri);
static void sleepq_timeout(void *arg);

SDT_PROBE_DECLARE(sched, , , sleep);
SDT_PROBE_DECLARE(sched, , , wakeup);

/*
 * Early initialization of sleep queues that is called from the sleepinit()
 * SYSINIT.
 *
 * Initializes every chain's queue list and spin lock, creates the
 * "SLEEPQUEUE" UMA zone and allocates a sleep queue for thread0.  With
 * SLEEPQUEUE_PROFILING, a sysctl node named after the chain index is also
 * added for each chain, exposing its depth counters.
 */
void
init_sleepqueues(void)
{
#ifdef SLEEPQUEUE_PROFILING
	struct sysctl_oid *chain_oid;
	char chain_name[10];
#endif
	int i;

	for (i = 0; i < SC_TABLESIZE; i++) {
		LIST_INIT(&sleepq_chains[i].sc_queues);
		mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
		    MTX_SPIN | MTX_RECURSE);
#ifdef SLEEPQUEUE_PROFILING
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
		    NULL);
#endif
	}
	/* The destructor (sleepq_dtor) is only registered with INVARIANTS. */
	sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
#ifdef INVARIANTS
	    NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#else
	    NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#endif

	thread0.td_sleepqueue = sleepq_alloc();
}

/*
 * Get a sleep queue for a new thread.
 *
 * Allocates from sleepq_zone with M_WAITOK.
 */
struct sleepqueue *
sleepq_alloc(void)
{

	return (uma_zalloc(sleepq_zone, M_WAITOK));
}

/*
 * Free a sleep queue when a thread is destroyed.
 *
 * Returns 'sq' to sleepq_zone.
 */
void
sleepq_free(struct sleepqueue *sq)
{

	uma_zfree(sleepq_zone, sq);
}

/*
 * Lock the sleep queue chain associated with the specified wait channel.
*/
void
sleepq_lock(void *wchan)
{
	struct sleepqueue_chain *sc;

	/* The chain is selected purely by hashing the wait channel address. */
	sc = SC_LOOKUP(wchan);
	mtx_lock_spin(&sc->sc_lock);
}

/*
 * Look up the sleep queue associated with a given wait channel in the hash
 * table.  The caller must already hold the associated sleep queue chain
 * lock (this is asserted, not acquired here).  If no queue is found in
 * the table, NULL is returned.
 */
struct sleepqueue *
sleepq_lookup(void *wchan)
{
	struct sleepqueue_chain *sc;
	struct sleepqueue *sq;

	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
	sc = SC_LOOKUP(wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);
	/* Several wait channels can hash to one chain; match on sq_wchan. */
	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
		if (sq->sq_wchan == wchan)
			return (sq);
	return (NULL);
}

/*
 * Unlock the sleep queue chain associated with a given wait channel.
 */
void
sleepq_release(void *wchan)
{
	struct sleepqueue_chain *sc;

	sc = SC_LOOKUP(wchan);
	mtx_unlock_spin(&sc->sc_lock);
}

/*
 * Places the current thread on the sleep queue for the specified wait
 * channel.  If INVARIANTS is enabled, then it associates the passed in
 * lock with the sleepq to make sure it is held when that sleep queue is
 * woken up.
 *
 * The chain lock for 'wchan' must be held by the caller (asserted).
 * 'flags' supplies the queue type (masked with SLEEPQ_TYPE) and may
 * include SLEEPQ_INTERRUPTIBLE and SLEEPQ_STOP_ON_BDRY; 'queue' selects
 * which of the NR_SLEEPQS blocked lists the thread is appended to.
 */
void
sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
    int queue)
{
	struct sleepqueue_chain *sc;
	struct sleepqueue *sq;
	struct thread *td;

	td = curthread;
	sc = SC_LOOKUP(wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);
	MPASS(td->td_sleepqueue != NULL);
	MPASS(wchan != NULL);
	MPASS((queue >= 0) && (queue < NR_SLEEPQS));

	/* If this thread is not allowed to sleep, die a horrible death. */
	KASSERT(!(td->td_pflags & TDP_NOSLEEPING),
	    ("Trying sleep, but thread marked as sleeping prohibited"));

	/* Look up the sleep queue associated with the wait channel 'wchan'. */
	sq = sleepq_lookup(wchan);

	/*
	 * If the wait channel does not already have a sleep queue, use
	 * this thread's sleep queue.  Otherwise, insert the current thread
	 * into the sleep queue already in use by this wait channel.
	 */
	if (sq == NULL) {
#ifdef INVARIANTS
		int i;

		/* Check that the queue being lent is completely unused. */
		sq = td->td_sleepqueue;
		for (i = 0; i < NR_SLEEPQS; i++) {
			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
			    ("thread's sleep queue %d is not empty", i));
			KASSERT(sq->sq_blockedcnt[i] == 0,
			    ("thread's sleep queue %d count mismatches", i));
		}
		KASSERT(LIST_EMPTY(&sq->sq_free),
		    ("thread's sleep queue has a non-empty free list"));
		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
		sq->sq_lock = lock;
#endif
#ifdef SLEEPQUEUE_PROFILING
		/* Track the current and maximum chain depth. */
		sc->sc_depth++;
		if (sc->sc_depth > sc->sc_max_depth) {
			sc->sc_max_depth = sc->sc_depth;
			if (sc->sc_max_depth > sleepq_max_depth)
				sleepq_max_depth = sc->sc_max_depth;
		}
#endif
		sq = td->td_sleepqueue;
		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
		sq->sq_wchan = wchan;
		sq->sq_type = flags & SLEEPQ_TYPE;
	} else {
		MPASS(wchan == sq->sq_wchan);
		MPASS(lock == sq->sq_lock);
		MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
		/* Park this thread's own queue on the in-use queue's free list. */
		LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
	}
	thread_lock(td);
	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
	sq->sq_blockedcnt[queue]++;
	/* The thread no longer owns a sleep queue while it is queued. */
	td->td_sleepqueue = NULL;
	td->td_sqqueue = queue;
	td->td_wchan = wchan;
	td->td_wmesg = wmesg;
	if (flags & SLEEPQ_INTERRUPTIBLE) {
		td->td_flags |= TDF_SINTR;
		td->td_flags &= ~TDF_SLEEPABORT;
		if (flags & SLEEPQ_STOP_ON_BDRY)
			td->td_flags |= TDF_SBDRY;
	}
	thread_unlock(td);
}

/*
 * Sets a timeout that will remove the current thread from the specified
 * sleep queue after timo ticks if the thread has not already been awakened.
*/
/*
 * Implementation notes: when 'bt' is NULL, 'timo' is passed to
 * callout_reset_curcpu(); otherwise *bt is passed to callout_reset_bt_on()
 * for the current CPU and 'timo' is not used.  In both cases the callout
 * handler is sleepq_timeout() with the current thread as its argument.
 * The chain lock for 'wchan' must be held and the current thread must
 * already be on a sleep queue (both asserted).
 */
void
-sleepq_set_timeout_bt(void *wchan, struct bintime bt)
+_sleepq_set_timeout(void *wchan, struct bintime *bt, int timo)
{
	struct sleepqueue_chain *sc;
	struct thread *td;

	td = curthread;
	sc = SC_LOOKUP(wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);
	MPASS(TD_ON_SLEEPQ(td));
	MPASS(td->td_sleepqueue == NULL);
	MPASS(wchan != NULL);
-	callout_reset_bt_on(&td->td_slpcallout, bt, sleepq_timeout, td, PCPU_GET(cpuid), 0);
-}
-
-void
-sleepq_set_timeout(void *wchan, int timo)
-{
-	struct sleepqueue_chain *sc;
-	struct thread *td;
-
-	td = curthread;
-	sc = SC_LOOKUP(wchan);
-	mtx_assert(&sc->sc_lock, MA_OWNED);
-	MPASS(TD_ON_SLEEPQ(td));
-	MPASS(td->td_sleepqueue == NULL);
-	MPASS(wchan != NULL);
-	callout_reset_curcpu(&td->td_slpcallout, timo, sleepq_timeout, td);
+	if (bt == NULL)
+		callout_reset_curcpu(&td->td_slpcallout, timo,
+		    sleepq_timeout, td);
+	else
+		callout_reset_bt_on(&td->td_slpcallout, *bt,
+		    sleepq_timeout, td, PCPU_GET(cpuid), 0);
}

/*
 * Return the number of actual sleepers for the specified queue.
 *
 * Returns 0 when no sleep queue is associated with 'wchan'.  The lookup
 * goes through sleepq_lookup(), which asserts that the chain lock is held.
 */
u_int
sleepq_sleepcnt(void *wchan, int queue)
{
	struct sleepqueue *sq;

	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
	sq = sleepq_lookup(wchan);
	if (sq == NULL)
		return (0);
	return (sq->sq_blockedcnt[queue]);
}

/*
 * Marks the pending sleep of the current thread as interruptible and
 * makes an initial check for pending signals before putting a thread
 * to sleep. Enters and exits with the thread lock held. Thread lock
 * may have transitioned from the sleepq lock to a run lock.
*/
/*
 * Implementation notes: returns 0 after going through sleepq_switch(), or
 * EINTR/ERESTART when the sleep is abandoned; in the latter case the
 * thread is removed from the sleep queue if it is still on it.  The chain
 * lock for 'wchan' must be held on entry (asserted).
 */
static int
sleepq_catch_signals(void *wchan, int pri)
{
	struct sleepqueue_chain *sc;
	struct sleepqueue *sq;
	struct thread *td;
	struct proc *p;
	struct sigacts *ps;
	int sig, ret, stop_allowed;

	td = curthread;
	p = curproc;
	sc = SC_LOOKUP(wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);
	MPASS(wchan != NULL);
	/* A pending TDP_WAKEUP is consumed and reported as EINTR. */
	if ((td->td_pflags & TDP_WAKEUP) != 0) {
		td->td_pflags &= ~TDP_WAKEUP;
		ret = EINTR;
		thread_lock(td);
		goto out;
	}

	/*
	 * See if there are any pending signals for this thread.  If not
	 * we can switch immediately.  Otherwise do the signal processing
	 * directly.
	 */
	thread_lock(td);
	if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) {
		sleepq_switch(wchan, pri);
		return (0);
	}
	stop_allowed = (td->td_flags & TDF_SBDRY) ? SIG_STOP_NOT_ALLOWED :
	    SIG_STOP_ALLOWED;
	/* Both spin locks are dropped before taking the proc lock. */
	thread_unlock(td);
	mtx_unlock_spin(&sc->sc_lock);
	CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
		(void *)td, (long)p->p_pid, td->td_name);
	PROC_LOCK(p);
	ps = p->p_sigacts;
	mtx_lock(&ps->ps_mtx);
	sig = cursig(td, stop_allowed);
	if (sig == 0) {
		mtx_unlock(&ps->ps_mtx);
		ret = thread_suspend_check(1);
		MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
	} else {
		/* Membership in ps_sigintr selects EINTR over ERESTART. */
		if (SIGISMEMBER(ps->ps_sigintr, sig))
			ret = EINTR;
		else
			ret = ERESTART;
		mtx_unlock(&ps->ps_mtx);
	}
	/*
	 * Lock the per-process spinlock prior to dropping the PROC_LOCK
	 * to avoid a signal delivery race.  PROC_LOCK, PROC_SLOCK, and
	 * thread_lock() are currently held in tdsendsignal().
	 */
	PROC_SLOCK(p);
	mtx_lock_spin(&sc->sc_lock);
	PROC_UNLOCK(p);
	thread_lock(td);
	PROC_SUNLOCK(p);
	if (ret == 0) {
		sleepq_switch(wchan, pri);
		return (0);
	}
out:
	/*
	 * There were pending signals and this thread is still
	 * on the sleep queue, remove it from the sleep queue.
	 */
	if (TD_ON_SLEEPQ(td)) {
		sq = sleepq_lookup(wchan);
		if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
			/*
			 * This thread hasn't gone to sleep yet, so it
			 * should not be swapped out.
			 */
			panic("not waking up swapper");
#endif
		}
	}
	mtx_unlock_spin(&sc->sc_lock);
	MPASS(td->td_lock != &sc->sc_lock);
	return (ret);
}

/*
 * Switches to another thread if we are still asleep on a sleep queue.
 * Returns with thread lock.
 *
 * Entered with both the chain lock for 'wchan' and the thread lock held
 * (asserted).  On the two early-return paths the chain lock is released
 * explicitly; otherwise the thread lock is switched to the chain lock via
 * thread_lock_set() before mi_switch() is called.
 */
static void
sleepq_switch(void *wchan, int pri)
{
	struct sleepqueue_chain *sc;
	struct sleepqueue *sq;
	struct thread *td;

	td = curthread;
	sc = SC_LOOKUP(wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * If we have a sleep queue, then we've already been woken up, so
	 * just return.
	 */
	if (td->td_sleepqueue != NULL) {
		mtx_unlock_spin(&sc->sc_lock);
		return;
	}

	/*
	 * If TDF_TIMEOUT is set, then our sleep has been timed out
	 * already but we are still on the sleep queue, so dequeue the
	 * thread and return.
	 */
	if (td->td_flags & TDF_TIMEOUT) {
		MPASS(TD_ON_SLEEPQ(td));
		sq = sleepq_lookup(wchan);
		if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
			/*
			 * This thread hasn't gone to sleep yet, so it
			 * should not be swapped out.
			 */
			panic("not waking up swapper");
#endif
		}
		mtx_unlock_spin(&sc->sc_lock);
		return;
	}
#ifdef SLEEPQUEUE_PROFILING
	if (prof_enabled)
		sleepq_profile(td->td_wmesg);
#endif
	MPASS(td->td_sleepqueue == NULL);
	sched_sleep(td, pri);
	thread_lock_set(td, &sc->sc_lock);
	SDT_PROBE0(sched, , , sleep);
	TD_SET_SLEEPING(td);
	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
	CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
}

/*
 * Check to see if we timed out.
 *
 * Returns EWOULDBLOCK if TDF_TIMEOUT was set (the flag is cleared), 0
 * otherwise.  Must be called with the thread lock held (asserted).
 */
static int
sleepq_check_timeout(void)
{
	struct thread *td;

	td = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * If TDF_TIMEOUT is set, we timed out.
	 */
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		return (EWOULDBLOCK);
	}

	/*
	 * If TDF_TIMOFAIL is set, the timeout ran after we had
	 * already been woken up.
	 */
	if (td->td_flags & TDF_TIMOFAIL)
		td->td_flags &= ~TDF_TIMOFAIL;

	/*
	 * If callout_stop() fails, then the timeout is running on
	 * another CPU, so synchronize with it to avoid having it
	 * accidentally wake up a subsequent sleep.
	 */
	else if (callout_stop(&td->td_slpcallout) == 0) {
		td->td_flags |= TDF_TIMEOUT;
		TD_SET_SLEEPING(td);
		mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
	}
	return (0);
}

/*
 * Check to see if we were awoken by a signal.
 *
 * Clears TDF_SINTR and TDF_SBDRY if TDF_SINTR was set.  If TDF_SLEEPABORT
 * is set it is cleared and td_intrval is returned; otherwise returns 0.
 * Must be called with the thread lock held (asserted).
 */
static int
sleepq_check_signals(void)
{
	struct thread *td;

	td = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/* We are no longer in an interruptible sleep. */
	if (td->td_flags & TDF_SINTR)
		td->td_flags &= ~(TDF_SINTR | TDF_SBDRY);

	if (td->td_flags & TDF_SLEEPABORT) {
		td->td_flags &= ~TDF_SLEEPABORT;
		return (td->td_intrval);
	}

	return (0);
}

/*
 * Block the current thread until it is awakened from its sleep queue.
 *
 * Asserts that the sleep was not marked interruptible (TDF_SINTR clear).
 */
void
sleepq_wait(void *wchan, int pri)
{
	struct thread *td;

	td = curthread;
	MPASS(!(td->td_flags & TDF_SINTR));
	thread_lock(td);
	sleepq_switch(wchan, pri);
	thread_unlock(td);
}

/*
 * Block the current thread until it is awakened from its sleep queue
 * or it is interrupted by a signal.
 *
 * A non-zero result from sleepq_catch_signals() takes precedence over the
 * result of sleepq_check_signals().
 */
int
sleepq_wait_sig(void *wchan, int pri)
{
	int rcatch;
	int rval;

	rcatch = sleepq_catch_signals(wchan, pri);
	rval = sleepq_check_signals();
	thread_unlock(curthread);
	if (rcatch)
		return (rcatch);
	return (rval);
}

/*
 * Block the current thread until it is awakened from its sleep queue
 * or it times out while waiting.
 *
 * Returns the result of sleepq_check_timeout().  Asserts that the sleep
 * was not marked interruptible (TDF_SINTR clear).
 */
int
sleepq_timedwait(void *wchan, int pri)
{
	struct thread *td;
	int rval;

	td = curthread;
	MPASS(!(td->td_flags & TDF_SINTR));
	thread_lock(td);
	sleepq_switch(wchan, pri);
	rval = sleepq_check_timeout();
	thread_unlock(td);

	return (rval);
}

/*
 * Block the current thread until it is awakened from its sleep queue,
 * it is interrupted by a signal, or it times out waiting to be awakened.
*/
/*
 * Implementation notes: the result is chosen in this order of precedence:
 * a non-zero value from sleepq_catch_signals(), then a non-zero value from
 * sleepq_check_signals(), then the value from sleepq_check_timeout().
 */
int
sleepq_timedwait_sig(void *wchan, int pri)
{
	int rcatch, rvalt, rvals;

	rcatch = sleepq_catch_signals(wchan, pri);
	/* Both checks always run, even when rcatch is already non-zero. */
	rvalt = sleepq_check_timeout();
	rvals = sleepq_check_signals();
	thread_unlock(curthread);
	if (rcatch)
		return (rcatch);
	if (rvals)
		return (rvals);
	return (rvalt);
}

/*
 * Returns the type of sleepqueue given a waitchannel.
 *
 * Takes and releases the chain lock itself.  Returns -1 when no sleep
 * queue is associated with 'wchan'.
 */
int
sleepq_type(void *wchan)
{
	struct sleepqueue *sq;
	int type;

	MPASS(wchan != NULL);

	sleepq_lock(wchan);
	sq = sleepq_lookup(wchan);
	if (sq == NULL) {
		sleepq_release(wchan);
		return (-1);
	}
	type = sq->sq_type;
	sleepq_release(wchan);
	return (type);
}

/*
 * Removes a thread from a sleep queue and makes it
 * runnable.
 *
 * Requires the thread lock and the chain lock for sq->sq_wchan (both
 * asserted).  A 'pri' of 0 means no priority adjustment.  Returns the
 * value of setrunnable() if the thread was sleeping, 0 otherwise.
 */
static int
sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
{
	struct sleepqueue_chain *sc;

	MPASS(td != NULL);
	MPASS(sq->sq_wchan != NULL);
	MPASS(td->td_wchan == sq->sq_wchan);
	MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	sc = SC_LOOKUP(sq->sq_wchan);
	mtx_assert(&sc->sc_lock, MA_OWNED);

	SDT_PROBE2(sched, , , wakeup, td, td->td_proc);

	/* Remove the thread from the queue. */
	sq->sq_blockedcnt[td->td_sqqueue]--;
	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);

	/*
	 * Get a sleep queue for this thread.  If this is the last waiter,
	 * use the queue itself and take it out of the chain, otherwise,
	 * remove a queue from the free list.
	 */
	if (LIST_EMPTY(&sq->sq_free)) {
		td->td_sleepqueue = sq;
#ifdef INVARIANTS
		sq->sq_wchan = NULL;
#endif
#ifdef SLEEPQUEUE_PROFILING
		sc->sc_depth--;
#endif
	} else
		td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
	/* Unlinks from either the chain or the free list, via sq_hash. */
	LIST_REMOVE(td->td_sleepqueue, sq_hash);

	td->td_wmesg = NULL;
	td->td_wchan = NULL;
	td->td_flags &= ~(TDF_SINTR | TDF_SBDRY);

	CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
	    (void *)td, (long)td->td_proc->p_pid, td->td_name);

	/* Adjust priority if requested. */
	MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
	if (pri != 0 && td->td_priority > pri &&
	    PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_prio(td, pri);

	/*
	 * Note that thread td might not be sleeping if it is running
	 * sleepq_catch_signals() on another CPU or is blocked on its
	 * proc lock to check signals.  There's no need to mark the
	 * thread runnable in that case.
	 */
	if (TD_IS_SLEEPING(td)) {
		TD_CLR_SLEEPING(td);
		return (setrunnable(td));
	}
	return (0);
}

#ifdef INVARIANTS
/*
 * UMA zone item deallocator.
 *
 * Asserts that every blocked list of the sleep queue is empty and its
 * counter is zero.  Only compiled with INVARIANTS.
 */
static void
sleepq_dtor(void *mem, int size, void *arg)
{
	struct sleepqueue *sq;
	int i;

	sq = mem;
	for (i = 0; i < NR_SLEEPQS; i++) {
		MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
		MPASS(sq->sq_blockedcnt[i] == 0);
	}
}
#endif

/*
 * UMA zone item initializer.
 *
 * Zeroes the item, then initializes each blocked list, its counter and
 * the free list.  Always returns 0.
 */
static int
sleepq_init(void *mem, int size, int flags)
{
	struct sleepqueue *sq;
	int i;

	bzero(mem, size);
	sq = mem;
	for (i = 0; i < NR_SLEEPQS; i++) {
		TAILQ_INIT(&sq->sq_blocked[i]);
		sq->sq_blockedcnt[i] = 0;
	}
	LIST_INIT(&sq->sq_free);
	return (0);
}

/*
 * Find the highest priority thread sleeping on a wait channel and resume it.
 *
 * Returns 0 if no sleep queue is associated with 'wchan'; otherwise
 * returns the value of sleepq_resume_thread() for the chosen thread.
 * The chain lock must be held (asserted inside sleepq_lookup()).
 */
int
sleepq_signal(void *wchan, int flags, int pri, int queue)
{
	struct sleepqueue *sq;
	struct thread *td, *besttd;
	int wakeup_swapper;

	CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
	sq = sleepq_lookup(wchan);
	if (sq == NULL)
		return (0);
	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));

	/*
	 * Find the highest priority thread on the queue.  If there is a
	 * tie, use the thread that first appears in the queue as it has
	 * been sleeping the longest since threads are always added to
	 * the tail of sleep queues.
	 */
	besttd = NULL;
	TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
		/* A numerically lower td_priority wins; ties keep the earlier. */
		if (besttd == NULL || td->td_priority < besttd->td_priority)
			besttd = td;
	}
	MPASS(besttd != NULL);
	thread_lock(besttd);
	wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
	thread_unlock(besttd);
	return (wakeup_swapper);
}

/*
 * Resume all threads sleeping on a specified wait channel.
 *
 * Returns 0 if no sleep queue is associated with 'wchan'; otherwise
 * returns 1 if sleepq_resume_thread() returned non-zero for any thread on
 * the selected queue, else 0.
 */
int
sleepq_broadcast(void *wchan, int flags, int pri, int queue)
{
	struct sleepqueue *sq;
	struct thread *td, *tdn;
	int wakeup_swapper;

	CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
	sq = sleepq_lookup(wchan);
	if (sq == NULL)
		return (0);
	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));

	/* Resume all blocked threads on the sleep queue. */
	wakeup_swapper = 0;
	/* The _SAFE variant is needed: each resume unlinks td from the list. */
	TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
		thread_lock(td);
		if (sleepq_resume_thread(sq, td, pri))
			wakeup_swapper = 1;
		thread_unlock(td);
	}
	return (wakeup_swapper);
}

/*
 * Time sleeping threads out.  When the timeout expires, the thread is
 * removed from the sleep queue and made runnable if it is still asleep.
 */
static void
sleepq_timeout(void *arg)
{
	struct sleepqueue_chain *sc;
	struct sleepqueue *sq;
	struct thread *td;
	void *wchan;
	int wakeup_swapper;

	td = arg;
	wakeup_swapper = 0;
	CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);

	/*
	 * First, see if the thread is asleep and get the wait channel if
	 * it is.
*/ thread_lock(td); if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0); thread_unlock(td); if (wakeup_swapper) kick_proc0(); return; } /* * If the thread is on the SLEEPQ but isn't sleeping yet, it * can either be on another CPU in between sleepq_add() and * one of the sleepq_*wait*() routines or it can be in * sleepq_catch_signals(). */ if (TD_ON_SLEEPQ(td)) { td->td_flags |= TDF_TIMEOUT; thread_unlock(td); return; } /* * Now check for the edge cases. First, if TDF_TIMEOUT is set, * then the other thread has already yielded to us, so clear * the flag and resume it. If TDF_TIMEOUT is not set, then * we know that the other thread is not on a sleep queue, but it * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL * to let it know that the timeout has already run and doesn't * need to be canceled. */ if (td->td_flags & TDF_TIMEOUT) { MPASS(TD_IS_SLEEPING(td)); td->td_flags &= ~TDF_TIMEOUT; TD_CLR_SLEEPING(td); wakeup_swapper = setrunnable(td); } else td->td_flags |= TDF_TIMOFAIL; thread_unlock(td); if (wakeup_swapper) kick_proc0(); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. Does nothing if the thread is not * on a sleep queue or is waiting on a different channel. */ void sleepq_remove(struct thread *td, void *wchan) { struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel; if it is not, then * bail. */ MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); /* * We cannot lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives.
*/ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { sleepq_release(wchan); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ thread_lock(td); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0); thread_unlock(td); sleepq_release(wchan); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) return (0); CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; td->td_flags |= TDF_SLEEPABORT; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) return (0); wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. 
*/ return (sleepq_resume_thread(sq, td, 0)); } #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); 
if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); sbuf_printf(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. 
*/ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[0], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif Index: projects/calloutng/sys/kern/sys_generic.c =================================================================== --- projects/calloutng/sys/kern/sys_generic.c (revision 237201) +++ projects/calloutng/sys/kern/sys_generic.c (revision 237202) @@ -1,1770 +1,1775 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include int iosize_max_clamp = 1; SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); /* * Assert that the return value of read(2) and write(2) syscalls fits * into a register. If not, an architecture will need to provide the * usermode wrappers to reconstruct the result. 
*/ CTASSERT(sizeof(register_t) >= sizeof(size_t)); static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); -static int seltdwait(struct thread *, int); +static int seltdwait(struct thread *, struct bintime *, int); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. 
*/ void *sf_cookie; /* (k) fd or pollfd. */ }; static uma_zone_t selfd_zone; static struct mtx_pool *mtxpool_select; #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int sys_read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return(error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pread(td, uap) struct thread *td; struct pread_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pread(td, uap) struct thread *td; struct freebsd6_pread_args *uap; { struct pread_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (sys_pread(td, &oargs)); } /* * Scatter read system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; int error; error = fget_read(td, fd, CAP_READ, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. 
*/ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return(0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int sys_write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return(error); } /* * Positioned write system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pwrite(td, uap) struct thread *td; struct freebsd6_pwrite_args *uap; { struct pwrite_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (sys_pwrite(td, &oargs)); } /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; int error; error = fget_write(td, fd, CAP_WRITE, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Truncate a file given a file descriptor. 
* * Can't use fget_write() here, since must return EINVAL and not EBADF if the * descriptor isn't writable. */ int kern_ftruncate(td, fd, length) struct thread *td; int fd; off_t length; { struct file *fp; int error; AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); error = fget(td, fd, CAP_FTRUNCATE, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); return (EINVAL); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int sys_ftruncate(td, uap) struct thread *td; struct ftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(td, uap) struct thread *td; struct oftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int sys_ioctl(struct thread *td, struct ioctl_args *uap) { u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (com & IOC_VOID) { /* Integer argument. 
*/ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } else data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error) { if (size > 0) free(data, M_IOCTLOPS); return (error); } } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); if (size > 0) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error; int tmp; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIOCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIONBIO: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: fdrop(fp, td); return (error); } int poll_no_poll(int events) { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. 
*/ if (events & ~POLLSTANDARD) return (POLLNVAL); return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * pselect handler: copy in the optional timespec (converted to a timeval) * and the optional signal mask from the caller's arguments, then hand off * to kern_pselect() using the native NFDBITS. */ int sys_pselect(struct thread *td, struct pselect_args *uap) { struct timespec ts; struct timeval tv, *tvp; sigset_t set, *uset; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&tv, &ts); tvp = &tv; } else tvp = NULL; if (uap->sm != NULL) { error = copyin(uap->sm, &set, sizeof(set)); if (error != 0) return (error); uset = &set; } else uset = NULL; return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, uset, NFDBITS)); } /* * If uset is non-NULL, install it as the thread's signal mask (saving the * previous mask in td_oldsigmask and setting TDP_OLDMASK), then run * kern_select() with the remaining arguments. */ int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits) { int error; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error != 0) return (error); td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif /* * select handler: copy in the optional timeval and call kern_select() * using the native NFDBITS. */ int sys_select(struct thread *td, struct select_args *uap) { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, NFDBITS)); } /* * In the unlikely case when the user specified n greater than the last * open file descriptor, check that no bits are set after the last * valid fd. We must return EBADF if any is set. * * There are applications that rely on this behaviour. * * nd is fd_lastfile + 1.
*/ static int select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) { char *addr, *oaddr; int b, i, res; uint8_t bits; if (nd >= ndu || fd_in == NULL) return (0); oaddr = NULL; bits = 0; /* silence gcc */ for (i = nd; i < ndu; i++) { b = i / NBBY; #if BYTE_ORDER == LITTLE_ENDIAN addr = (char *)fd_in + b; #else addr = (char *)fd_in; if (abi_nfdbits == NFDBITS) { addr += rounddown(b, sizeof(fd_mask)) + sizeof(fd_mask) - 1 - b % sizeof(fd_mask); } else { addr += rounddown(b, sizeof(uint32_t)) + sizeof(uint32_t) - 1 - b % sizeof(uint32_t); } #endif if (addr != oaddr) { res = fubyte(addr); if (res == -1) return (EFAULT); oaddr = addr; bits = res; } if ((bits & (1 << (i % NBBY))) != 0) return (EBADF); } return (0); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; - struct timeval atv, rtv, ttv; + struct bintime abt, rbt; + struct timeval atv; int error, lf, ndu, timo; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_lastfile; if (nd > lf + 1) nd = lf + 1; error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); if (error != 0) return (error); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. 
*/ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; ncpubytes = roundup(nd, abi_nfdbits) / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) { \ ibits[x] = NULL; \ obits[x] = NULL; \ } else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ bzero((char *)ibits[x] + ncpubytes, \ ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) /* * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, * we are running under 32-bit emulation. This should be more * generic. 
*/ #define swizzle_fdset(bits) \ if (abi_nfdbits != NFDBITS && bits != NULL) { \ int i; \ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ } #else #define swizzle_fdset(bits) #endif /* Make sure the bit order makes it through an ABI transition */ swizzle_fdset(ibits[0]); swizzle_fdset(ibits[1]); swizzle_fdset(ibits[2]); if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (tvp != NULL) { atv = *tvp; - if (itimerfix(&atv)) { + if (atv.tv_sec < 0 || atv.tv_usec < 0 || + atv.tv_usec >= 1000000) { error = EINVAL; goto done; } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); + binuptime(&rbt); + timeval2bintime(&atv, &abt); + bintime_add(&abt, &rbt); } else { - atv.tv_sec = 0; - atv.tv_usec = 0; + abt.sec = 0; + abt.frac = 0; } - timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) + if (abt.sec || abt.frac) { + binuptime(&rbt); + if (bintime_cmp(&rbt, &abt, >=)) break; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); + error = seltdwait(td, &abt, 0); } - error = seltdwait(td, timo); + else { + timo = 0; + error = seltdwait(td, NULL, timo); + } if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... 
*/ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; /* swizzle bit order back, if necessary */ swizzle_fdset(obits[0]); swizzle_fdset(obits[1]); swizzle_fdset(obits[2]); #undef swizzle_fdset #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Convert a select bit set to poll flags. * * The backend always returns POLLHUP/POLLERR if appropriate and we * return this as a set bit in any set. */ static int select_flags[3] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND | POLLERR }; /* * Compute the fo_poll flags required for a fd given by the index and * bit position in the fd_mask array. */ static __inline int selflags(fd_mask **ibits, int idx, fd_mask bit) { int flags; int msk; flags = 0; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; flags |= select_flags[msk]; } return (flags); } /* * Set the appropriate output bits given a mask of fired events and the * input bits originally requested. */ static __inline int selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) { int msk; int n; n = 0; for (msk = 0; msk < 3; msk++) { if ((events & select_flags[msk]) == 0) continue; if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; /* * XXX Check for a duplicate set. This can occur because a * socket calls selrecord() twice for each poll() call * resulting in two selfds per real fd. selrescan() will * call selsetbits twice as a result. 
*/ if ((obits[msk][idx] & bit) != 0) continue; obits[msk][idx] |= bit; n++; } return (n); } static __inline int getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp) { struct file *fp; #ifdef CAPABILITIES struct file *fp_fromcap; int error; #endif if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); #ifdef CAPABILITIES /* * If the file descriptor is for a capability, test rights and use * the file descriptor references by the capability. */ error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap); if (error) { fdrop(fp, curthread); return (error); } if (fp != fp_fromcap) { fhold(fp_fromcap); fdrop(fp, curthread); fp = fp_fromcap; } #endif /* CAPABILITIES */ *fpp = fp; return (0); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct filedesc *fdp; struct selinfo *si; struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct file *fp; fd_mask bit; int fd, ev, n, idx; int error; fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; error = getselfd_cap(fdp, fd, &fp); if (error) return (error); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. 
*/ static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { struct filedesc *fdp; struct file *fp; fd_mask bit; int ev, flags, end, fd; int n, idx; int error; fdp = td->td_proc->p_fd; n = 0; for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { /* Compute the list of events we're interested in. */ flags = selflags(ibits, idx, bit); if (flags == 0) continue; error = getselfd_cap(fdp, fd, &fp); if (error) return (error); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } td->td_retval[0] = n; return (0); } #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int sys_poll(td, uap) struct thread *td; struct poll_args *uap; { struct pollfd *bits; struct pollfd smallbits[32]; - struct timeval atv, rtv, ttv; - int error = 0, timo; + struct bintime abt, rbt; + struct timeval atv; + int error, timo; u_int nfds; size_t ni; nfds = uap->nfds; if (nfds > maxfilesperproc && nfds > FD_SETSIZE) return (EINVAL); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) goto done; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; - if (itimerfix(&atv)) { + if (atv.tv_sec < 0 || atv.tv_usec < 0 || + atv.tv_usec >= 1000000) { error = EINVAL; goto done; } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); + binuptime(&rbt); + timeval2bintime(&atv, &abt); + bintime_add(&abt, &rbt); } else { - atv.tv_sec = 0; - atv.tv_usec = 0; + abt.sec = 0; + abt.frac = 0; } - timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. 
*/ for (;;) { error = pollscan(td, bits, nfds); if (error || td->td_retval[0] != 0) break; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) + if (abt.sec || abt.frac) { + binuptime(&rbt); + if (bintime_cmp(&rbt, &abt, >=)) break; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); + error = seltdwait(td, &abt, 0); + } else { + timo = 0; + error = seltdwait(td, NULL, timo); } - error = seltdwait(td, timo); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = pollout(td, bits, uap->fds, nfds); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); return (error); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; int n; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; fp = fdp->fd_ofiles[fd->fd]; #ifdef CAPABILITIES if ((fp == NULL) || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) { #else if (fp == NULL) { #endif fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. 
*/ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (fd->revents != 0) n++; } FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollout(td, fds, ufds, nfd) struct thread *td; struct pollfd *fds; struct pollfd *ufds; u_int nfd; { int error = 0; u_int i = 0; u_int n = 0; for (i = 0; i < nfd; i++) { error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); if (fds->revents != 0) n++; fds++; ufds++; } td->td_retval[0] = n; return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; FILEDESC_SLOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd]; #ifdef CAPABILITIES if ((fp == NULL) || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) { #else if (fp == NULL) { #endif fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); /* * POSIX requires POLLOUT to be never * set simultaneously with POLLHUP. */ if ((fds->revents & POLLHUP) != 0) fds->revents &= ~POLLOUT; if (fds->revents != 0) n++; } } } FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int sys_openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (sys_poll(td, (struct poll_args *)uap)); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. It returns * 0 if any events matched and an error otherwise. 
There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval atv, rtv, ttv; int error, timo; if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) return (EINVAL); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. */ for (;;) { selfdalloc(td, NULL); error = sopoll(so, events, NULL, td); /* error here is actually the ready events. */ if (error) return (0); if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { seltdclear(td); return (EWOULDBLOCK); } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } - error = seltdwait(td, timo); + error = seltdwait(td, NULL, timo); seltdclear(td); if (error) break; } /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); } /* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */ static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; } static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); mtx_lock(sfp->sf_mtx); if (sfp->sf_si) TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); mtx_unlock(sfp->sf_mtx); uma_zfree(selfd_zone, sfp); } /* Drain the waiters tied to all the selfd belonging the specified selinfo. 
*/ void seldrain(sip) struct selinfo *sip; { /* * This feature is already provided by doselwakeup(), thus it is * enough to go for it. * Eventually, the context, should take care to avoid races * between thread calling select()/poll() and file descriptor * detaching, but, again, the races are just the same as * selwakeup(). */ doselwakeup(sip, -1); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. */ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = sip->si_mtx; if (mtxp == NULL) mtxp = mtx_pool_find(mtxpool_select, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. 
*/ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si seltdclear will know to ignore this si. */ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); sfp->sf_si = NULL; stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; if ((stp = td->td_sel) != NULL) goto out; td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); out: stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); } static int -seltdwait(struct thread *td, int timo) +seltdwait(struct thread *td, struct bintime *bt, int timo) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } - if (timo > 0) + if (bt == NULL && timo > 0) error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); - else + else if (bt != NULL) + error = cv_timedwait_bt_sig(&stp->st_wait, &stp->st_mtx, *bt); + else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; if (stp->st_free1) uma_zfree(selfd_zone, stp->st_free1); if (stp->st_free2) uma_zfree(selfd_zone, stp->st_free2); td->td_sel = NULL; free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. 
*/ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); } Index: projects/calloutng/sys/sys/condvar.h =================================================================== --- projects/calloutng/sys/sys/condvar.h (revision 237201) +++ projects/calloutng/sys/sys/condvar.h (revision 237202) @@ -1,81 +1,84 @@ /*- * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_CONDVAR_H_ #define _SYS_CONDVAR_H_ #ifndef LOCORE #include struct lock_object; struct thread; TAILQ_HEAD(cv_waitq, thread); /* * Condition variable. The waiters count is protected by the mutex that * protects the condition; that is, the mutex that is passed to cv_wait*() * and is held across calls to cv_signal() and cv_broadcast(). It is an * optimization to avoid looking up the sleep queue if there are no waiters. */ struct cv { const char *cv_description; int cv_waiters; }; #ifdef _KERNEL void cv_init(struct cv *cvp, const char *desc); void cv_destroy(struct cv *cvp); void _cv_wait(struct cv *cvp, struct lock_object *lock); void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock); int _cv_wait_sig(struct cv *cvp, struct lock_object *lock); int _cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo); -int _cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo); +int _cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, + struct bintime *bt, int timo); void cv_signal(struct cv *cvp); void cv_broadcastpri(struct cv *cvp, int pri); #define cv_wait(cvp, lock) \ _cv_wait((cvp), &(lock)->lock_object) #define cv_wait_unlock(cvp, lock) \ _cv_wait_unlock((cvp), &(lock)->lock_object) #define cv_wait_sig(cvp, lock) \ _cv_wait_sig((cvp), &(lock)->lock_object) #define cv_timedwait(cvp, lock, timo) \ _cv_timedwait((cvp), &(lock)->lock_object, (timo)) #define cv_timedwait_sig(cvp, lock, timo) \ - 
_cv_timedwait_sig((cvp), &(lock)->lock_object, (timo)) + _cv_timedwait_sig((cvp), &(lock)->lock_object, (NULL), (timo)) +#define cv_timedwait_bt_sig(cvp, lock, bt) \ + _cv_timedwait_sig((cvp), &(lock)->lock_object, (&(bt)), (0)) /* bt: struct bintime lvalue; its address is passed, timo is 0 */ #define cv_broadcast(cvp) cv_broadcastpri(cvp, 0) #define cv_wmesg(cvp) ((cvp)->cv_description) #endif /* _KERNEL */ #endif /* !LOCORE */ #endif /* _SYS_CONDVAR_H_ */ Index: projects/calloutng/sys/sys/sleepqueue.h =================================================================== --- projects/calloutng/sys/sys/sleepqueue.h (revision 237201) +++ projects/calloutng/sys/sys/sleepqueue.h (revision 237202) @@ -1,121 +1,124 @@ /*- * Copyright (c) 2004 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SLEEPQUEUE_H_ #define _SYS_SLEEPQUEUE_H_ /* * Sleep queue interface. Sleep/wakeup, condition variables, and sx * locks use a sleep queue for the queue of threads blocked on a sleep * channel. * * A thread calls sleepq_lock() to lock the sleep queue chain associated * with a given wait channel. A thread can then call call sleepq_add() to * add themself onto a sleep queue and call one of the sleepq_wait() * functions to actually go to sleep. If a thread needs to abort a sleep * operation it should call sleepq_release() to unlock the associated sleep * queue chain lock. If the thread also needs to remove itself from a queue * it just enqueued itself on, it can use sleepq_remove() instead. * * If the thread only wishes to sleep for a limited amount of time, it can * call sleepq_set_timeout() after sleepq_add() to setup a timeout. It * should then use one of the sleepq_timedwait() functions to block. * * If the thread wants the sleep to be interruptible by signals, it can * call sleepq_catch_signals() after sleepq_add(). It should then use * one of the sleepq_wait_sig() functions to block. After the thread has * been resumed, it should call sleepq_calc_signal_retval() to determine * if it should return EINTR or ERESTART passing in the value returned from * the earlier call to sleepq_catch_signals(). * * A thread is normally resumed from a sleep queue by either the * sleepq_signal() or sleepq_broadcast() functions. 
Sleepq_signal() wakes * the thread with the highest priority that is sleeping on the specified * wait channel. Sleepq_broadcast() wakes all threads that are sleeping * on the specified wait channel. A thread sleeping in an interruptible * sleep can be interrupted by calling sleepq_abort(). A thread can also * be removed from a specified sleep queue using the sleepq_remove() * function. Note that the sleep queue chain must first be locked via * sleepq_lock() before calling sleepq_abort(), sleepq_broadcast(), or * sleepq_signal(). These routines each return a boolean that will be true * if at least one swapped-out thread was resumed. In that case, the caller * is responsible for waking up the swapper by calling kick_proc0() after * releasing the sleep queue chain lock. * * Each thread allocates a sleep queue at thread creation via sleepq_alloc() * and releases it at thread destruction via sleepq_free(). Note that * a sleep queue is not tied to a specific thread and that the sleep queue * released at thread destruction may not be the same sleep queue that the * thread allocated when it was created. * * XXX: Some other parts of the kernel such as ithread sleeping may end up * using this interface as well (death to TDI_IWAIT!) */ struct lock_object; struct sleepqueue; struct thread; #ifdef _KERNEL #define SLEEPQ_TYPE 0x0ff /* Mask of sleep queue types. */ #define SLEEPQ_SLEEP 0x00 /* Used by sleep/wakeup. */ #define SLEEPQ_CONDVAR 0x01 /* Used for a cv. */ #define SLEEPQ_PAUSE 0x02 /* Used by pause. */ #define SLEEPQ_SX 0x03 /* Used by an sx lock. */ #define SLEEPQ_LK 0x04 /* Used by a lockmgr. */ #define SLEEPQ_INTERRUPTIBLE 0x100 /* Sleep is interruptible. 
*/ #define SLEEPQ_STOP_ON_BDRY 0x200 /* Stop sleeping thread on user mode boundary */ void init_sleepqueues(void); int sleepq_abort(struct thread *td, int intrval); void sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue); struct sleepqueue *sleepq_alloc(void); int sleepq_broadcast(void *wchan, int flags, int pri, int queue); void sleepq_free(struct sleepqueue *sq); void sleepq_lock(void *wchan); struct sleepqueue *sleepq_lookup(void *wchan); void sleepq_release(void *wchan); void sleepq_remove(struct thread *td, void *wchan); int sleepq_signal(void *wchan, int flags, int pri, int queue); -void sleepq_set_timeout_bt(void *wchan, struct bintime bt); -void sleepq_set_timeout(void *wchan, int timo); +void _sleepq_set_timeout(void *wchan, struct bintime *bt, int timo); +#define sleepq_set_timeout(wchan, timo) \ + _sleepq_set_timeout((wchan), (NULL), (timo)) +#define sleepq_set_timeout_bt(wchan, bt) \ + _sleepq_set_timeout((wchan), (&(bt)), (0)) /* bt: struct bintime lvalue (the replaced function took it by value); its address is passed */ u_int sleepq_sleepcnt(void *wchan, int queue); int sleepq_timedwait(void *wchan, int pri); int sleepq_timedwait_sig(void *wchan, int pri); int sleepq_type(void *wchan); void sleepq_wait(void *wchan, int pri); int sleepq_wait_sig(void *wchan, int pri); #endif /* _KERNEL */ #endif /* !_SYS_SLEEPQUEUE_H_ */