diff --git a/share/man/man9/condvar.9 b/share/man/man9/condvar.9 index 4a4e8741eda0..3e02f2fa40c0 100644 --- a/share/man/man9/condvar.9 +++ b/share/man/man9/condvar.9 @@ -1,261 +1,261 @@ .\" .\" Copyright (C) 2000 Jason Evans . All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" .Dd February 19, 2013 .Dt CONDVAR 9 .Os .Sh NAME .Nm condvar , .Nm cv_init , .Nm cv_destroy , .Nm cv_wait , .Nm cv_wait_sig , .Nm cv_wait_unlock , .Nm cv_timedwait , .Nm cv_timedwait_sbt , .Nm cv_timedwait_sig , .Nm cv_timedwait_sig_sbt , .Nm cv_signal , .Nm cv_broadcast , .Nm cv_broadcastpri , .Nm cv_wmesg .Nd kernel condition variable .Sh SYNOPSIS .In sys/param.h .In sys/proc.h .In sys/condvar.h .Ft void .Fn cv_init "struct cv *cvp" "const char *desc" .Ft void .Fn cv_destroy "struct cv *cvp" .Ft void .Fn cv_wait "struct cv *cvp" "lock" .Ft int .Fn cv_wait_sig "struct cv *cvp" "lock" .Ft void .Fn cv_wait_unlock "struct cv *cvp" "lock" .Ft int .Fn cv_timedwait "struct cv *cvp" "lock" "int timo" .Ft int .Fn cv_timedwait_sbt "struct cv *cvp" "lock" "sbintime_t sbt" \ "sbintime_t pr" "int flags" .Ft int .Fn cv_timedwait_sig "struct cv *cvp" "lock" "int timo" .Ft int .Fn cv_timedwait_sig_sbt "struct cv *cvp" "lock" "sbintime_t sbt" \ "sbintime_t pr" "int flags" .Ft void .Fn cv_signal "struct cv *cvp" .Ft void .Fn cv_broadcast "struct cv *cvp" .Ft void .Fn cv_broadcastpri "struct cv *cvp" "int pri" .Ft const char * .Fn cv_wmesg "struct cv *cvp" .Sh DESCRIPTION Condition variables are used in conjunction with mutexes to wait for conditions to occur. Condition variables are created with .Fn cv_init , where .Fa cvp is a pointer to space for a .Vt struct cv , and .Fa desc is a pointer to a null-terminated character string that describes the condition variable. Condition variables are destroyed with .Fn cv_destroy . Threads wait on condition variables by calling .Fn cv_wait , .Fn cv_wait_sig , .Fn cv_wait_unlock , .Fn cv_timedwait , or .Fn cv_timedwait_sig . Threads unblock waiters by calling .Fn cv_signal to unblock one waiter, or .Fn cv_broadcast or .Fn cv_broadcastpri to unblock all waiters. 
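.Pp
The following minimal sketch illustrates the usual waiter/signaler
pattern with a
.Xr mutex 9
lock; the
.Vt struct softc ,
its fields, and the helper functions are hypothetical and shown for
illustration only.
The mutex and condition variable are assumed to have been set up with
.Fn mtx_init
and
.Fn cv_init .
.Bd -literal
struct softc {
        struct mtx      sc_mtx;         /* protects sc_ready */
        struct cv       sc_cv;          /* signalled when sc_ready changes */
        int             sc_ready;
};

static void
sc_wait_ready(struct softc *sc)
{
        mtx_lock(&sc->sc_mtx);
        /* Re-check the predicate after every wakeup. */
        while (sc->sc_ready == 0)
                cv_wait(&sc->sc_cv, &sc->sc_mtx);
        mtx_unlock(&sc->sc_mtx);
}

static void
sc_mark_ready(struct softc *sc)
{
        mtx_lock(&sc->sc_mtx);
        sc->sc_ready = 1;
        cv_broadcast(&sc->sc_cv);       /* wake all waiters */
        mtx_unlock(&sc->sc_mtx);
}
.Ed
.Pp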
In addition to waking waiters, .Fn cv_broadcastpri ensures that all of the waiters have a priority of at least .Fa pri by raising the priority of any threads that do not. .Fn cv_wmesg returns the description string of .Fa cvp , as set by the initial call to .Fn cv_init . .Pp The .Fa lock argument is a pointer to either a .Xr mutex 9 , .Xr rwlock 9 , or .Xr sx 9 lock. A .Xr mutex 9 argument must be initialized with .Dv MTX_DEF and not .Dv MTX_SPIN . A thread must hold .Fa lock before calling .Fn cv_wait , .Fn cv_wait_sig , .Fn cv_wait_unlock , .Fn cv_timedwait , or .Fn cv_timedwait_sig . When a thread waits on a condition, .Fa lock is atomically released before the thread is blocked, then reacquired before the function call returns. In addition, the thread will fully drop the .Va Giant mutex (even if recursed) while it is suspended and will reacquire the .Va Giant mutex before the function returns. The .Fn cv_wait_unlock function does not reacquire the lock before returning. Note that the .Va Giant mutex may be specified as .Fa lock . However, .Va Giant may not be used as .Fa lock for the .Fn cv_wait_unlock function. All waiters must pass the same .Fa lock in conjunction with .Fa cvp . .Pp When .Fn cv_wait , .Fn cv_wait_sig , .Fn cv_wait_unlock , .Fn cv_timedwait , and .Fn cv_timedwait_sig unblock, their calling threads are made runnable. .Fn cv_timedwait and .Fn cv_timedwait_sig wait for at most .Fa timo / .Dv HZ seconds before being unblocked and returning .Er EWOULDBLOCK ; otherwise, they return 0. .Fn cv_wait_sig and .Fn cv_timedwait_sig return prematurely with a value of .Er EINTR or .Er ERESTART if a signal is caught, or 0 if signaled via .Fn cv_signal or .Fn cv_broadcast . .Pp The .Fn cv_timedwait_sbt and .Fn cv_timedwait_sig_sbt functions take an .Fa sbt argument instead of .Fa timo . It allows the caller to specify a relative or absolute unblock time with higher resolution in the form of .Vt sbintime_t . The parameter .Fa pr allows the caller to specify the wanted absolute event precision. The parameter .Fa flags allows the caller to pass additional .Fn callout_reset_sbt flags. .Sh RETURN VALUES If successful, .Fn cv_wait_sig , .Fn cv_timedwait , and .Fn cv_timedwait_sig return 0. Otherwise, a non-zero error code is returned. .Pp .Fn cv_wmesg returns the description string that was passed to .Fn cv_init . .Sh ERRORS .Fn cv_wait_sig and .Fn cv_timedwait_sig will fail if: .Bl -tag -width Er .It Bq Er EINTR A signal was caught and the system call should be interrupted. .It Bq Er ERESTART A signal was caught and the system call should be restarted. .El .Pp .Fn cv_timedwait and .Fn cv_timedwait_sig will fail if: .Bl -tag -width Er .It Bq Er EWOULDBLOCK Timeout expired. .El .Sh SEE ALSO +.Xr callout 9 , .Xr locking 9 , .Xr mtx_pool 9 , .Xr mutex 9 , .Xr rwlock 9 , .Xr sema 9 , .Xr sleep 9 , -.Xr sx 9 , -.Xr timeout 9 +.Xr sx 9 diff --git a/share/man/man9/epoch.9 b/share/man/man9/epoch.9 index 1f191211b041..8a5008a6b238 100644 --- a/share/man/man9/epoch.9 +++ b/share/man/man9/epoch.9 @@ -1,296 +1,296 @@ .\" .\" Copyright (C) 2018 Matthew Macy . .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2.
Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" .Dd April 30, 2020 .Dt EPOCH 9 .Os .Sh NAME .Nm epoch , .Nm epoch_context , .Nm epoch_alloc , .Nm epoch_free , .Nm epoch_enter , .Nm epoch_exit , .Nm epoch_wait , .Nm epoch_enter_preempt , .Nm epoch_exit_preempt , .Nm epoch_wait_preempt , .Nm epoch_call , .Nm epoch_drain_callbacks , .Nm in_epoch , .Nm in_epoch_verbose .Nd kernel epoch based reclamation .Sh SYNOPSIS .In sys/param.h .In sys/proc.h .In sys/epoch.h .\" Types .Bd -literal struct epoch; /* Opaque */ .Ed .Vt typedef "struct epoch *epoch_t" ; .Bd -literal struct epoch_context { void *data[2]; }; .Ed .Vt typedef "struct epoch_context *epoch_context_t" ; .Vt typedef "void epoch_callback_t(epoch_context_t)" ; .Bd -literal struct epoch_tracker; /* Opaque */ .Ed .Vt typedef "struct epoch_tracker *epoch_tracker_t" ; .\" Declarations .Ft epoch_t .Fn epoch_alloc "const char *name" "int flags" .Ft void .Fn epoch_free "epoch_t epoch" .Ft void .Fn epoch_enter "epoch_t epoch" .Ft void .Fn epoch_exit "epoch_t epoch" .Ft void .Fn epoch_wait "epoch_t epoch" .Ft void .Fn epoch_enter_preempt "epoch_t epoch" "epoch_tracker_t et" .Ft void .Fn epoch_exit_preempt "epoch_t epoch" "epoch_tracker_t et" .Ft void .Fn epoch_wait_preempt "epoch_t epoch" .Ft void .Fn epoch_call "epoch_t epoch" "epoch_callback_t callback" "epoch_context_t ctx" .Ft void .Fn epoch_drain_callbacks "epoch_t epoch" .Ft int .Fn in_epoch "epoch_t epoch" .Ft int .Fn in_epoch_verbose "epoch_t epoch" "int dump_onfail" .Sh DESCRIPTION Epochs are used to guarantee liveness and immutability of data by deferring reclamation and mutation until a grace period has elapsed. Epochs do not have any lock ordering issues. Entering and leaving an epoch section will never block. .Pp Epochs are allocated with .Fn epoch_alloc . The .Fa name argument is used for debugging convenience when the .Cd EPOCH_TRACE kernel option is configured. By default, epochs do not allow preemption during sections. By default mutexes cannot be held across .Fn epoch_wait_preempt . The .Fa flags specified are formed by .Em OR Ns 'ing the following values: .Bl -tag -offset indent -width Ds .It Dv EPOCH_LOCKED Permit holding mutexes across .Fn epoch_wait_preempt (requires .Dv EPOCH_PREEMPT ) . When doing this one must be cautious of creating a situation where a deadlock is possible. .It Dv EPOCH_PREEMPT The .Vt epoch will allow preemption during sections. Only non-sleepable locks may be acquired during a preemptible epoch. 
The functions .Fn epoch_enter_preempt , .Fn epoch_exit_preempt , and .Fn epoch_wait_preempt must be used in place of .Fn epoch_enter , .Fn epoch_exit , and .Fn epoch_wait , respectively. .El .Pp .Vt epoch Ns s are freed with .Fn epoch_free . .Pp Threads indicate the start of an epoch critical section by calling .Fn epoch_enter (or .Fn epoch_enter_preempt for preemptible epochs). Threads call .Fn epoch_exit (or .Fn epoch_exit_preempt for preemptible epochs) to indicate the end of a critical section. .Vt struct epoch_tracker Ns s are stack objects whose pointers are passed to .Fn epoch_enter_preempt and .Fn epoch_exit_preempt (much like .Vt struct rm_priotracker ) . .Pp Threads can defer work, either synchronously or asynchronously, until a grace period has expired since any thread has entered the epoch. .Fn epoch_call defers work asynchronously by invoking the provided .Fa callback at a later time. .Fn epoch_wait (or .Fn epoch_wait_preempt ) blocks the current thread until the grace period has expired and the work can be done safely. .Pp Default, non-preemptible epoch wait .Fn ( epoch_wait ) is guaranteed to have much shorter completion times relative to preemptible epoch wait .Fn ( epoch_wait_preempt ) . (In the default type, no thread in an epoch section will be preempted before completing its section.) .Pp INVARIANTS can assert that a thread is in an epoch by using .Fn in_epoch . .Fn in_epoch "epoch" is equivalent to invoking .Fn in_epoch_verbose "epoch" "0" . If .Cd EPOCH_TRACE is enabled, .Fn in_epoch_verbose "epoch" "1" provides additional verbose debugging information. .Pp The epoch API currently does not support sleeping in epoch_preempt sections. A caller should never call .Fn epoch_wait in the middle of an epoch section for the same epoch as this will lead to a deadlock. .Pp The .Fn epoch_drain_callbacks function is used to drain all pending callbacks which have been scheduled by prior .Fn epoch_call function calls on the same epoch. This function is useful when there are shared memory structure(s) referred to by the epoch callback(s) which are not refcounted and are rarely freed. The typical place for calling this function is right before freeing or invalidating the shared resource(s) used by the epoch callback(s). This function can sleep and is not optimized for performance. .Sh RETURN VALUES .Fn in_epoch curepoch will return 1 if curthread is in curepoch, 0 otherwise. .Sh EXAMPLES Async free example: Thread 1: .Bd -literal int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_laddr *laddr, struct ucred *cred) { /* ... */ epoch_enter(net_epoch); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } epoch_exit(net_epoch); /* ... */ } .Ed Thread 2: .Bd -literal void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) epoch_call(net_epoch, ifa_destroy, &ifa->ifa_epoch_ctx); } void if_purgeaddrs(struct ifnet *ifp) { /* .... */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } .Ed .Pp Thread 1 traverses the ifaddr list in an epoch. Thread 2 unlinks with the corresponding epoch safe macro, marks as logically free, and then defers deletion. More general mutation or a synchronous free would have to follow a call to .Fn epoch_wait .
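.Pp
For illustration, a synchronous variant of the free shown above could
unlink the entry and then block in
.Fn epoch_wait
before freeing it directly.
This is only a sketch: the
.Fn if_purgeaddr_sync
helper does not exist in the tree, and the direct
.Fn free
of the address and its
.Dv M_IFADDR
malloc type are assumptions made for brevity.
.Bd -literal
void
if_purgeaddr_sync(struct ifnet *ifp, struct ifaddr *ifa)
{
        IF_ADDR_WLOCK(ifp);
        CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
        IF_ADDR_WUNLOCK(ifp);

        /*
         * Wait until every thread currently inside a net_epoch
         * section has left it; no reader can still hold a pointer
         * to the unlinked entry after this returns.
         */
        epoch_wait(net_epoch);
        free(ifa, M_IFADDR);
}
.Ed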
.Sh NOTES The .Nm kernel programming interface is under development and is subject to change. .Sh SEE ALSO +.Xr callout 9 , .Xr locking 9 , .Xr mtx_pool 9 , .Xr mutex 9 , .Xr rwlock 9 , .Xr sema 9 , .Xr sleep 9 , -.Xr sx 9 , -.Xr timeout 9 +.Xr sx 9 .Sh HISTORY The .Nm framework first appeared in .Fx 11.0 . .Sh CAVEATS One must be cautious when using .Fn epoch_wait_preempt . Threads are pinned during epoch sections, so if a thread in a section is then preempted by a higher priority compute bound thread on that CPU, it can be prevented from leaving the section indefinitely. .Pp Epochs are not a straight replacement for read locks. Callers must use safe list and tailq traversal routines in an epoch (see ck_queue). When modifying a list referenced from an epoch section safe removal routines must be used and the caller can no longer modify a list entry in place. An item to be modified must be handled with copy on write and frees must be deferred until after a grace period has elapsed. diff --git a/share/man/man9/locking.9 b/share/man/man9/locking.9 index e1e4ccd33664..64cba3b0159b 100644 --- a/share/man/man9/locking.9 +++ b/share/man/man9/locking.9 @@ -1,439 +1,439 @@ .\" Copyright (c) 2007 Julian Elischer (julian - freebsd org ) .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd February 3, 2023 .Dt LOCKING 9 .Os .Sh NAME .Nm locking .Nd kernel synchronization primitives .Sh DESCRIPTION The .Em FreeBSD kernel is written to run across multiple CPUs and as such provides several different synchronization primitives to allow developers to safely access and manipulate many data types. .Ss Mutexes Mutexes (also called "blocking mutexes") are the most commonly used synchronization primitive in the kernel. A thread acquires (locks) a mutex before accessing data shared with other threads (including interrupt threads), and releases (unlocks) it afterwards. If the mutex cannot be acquired, the thread requesting it will wait. Mutexes are adaptive by default, meaning that if the owner of a contended mutex is currently running on another CPU, then a thread attempting to acquire the mutex will spin rather than yielding the processor. Mutexes fully support priority propagation. .Pp See .Xr mutex 9 for details. 
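.Pp
As a minimal sketch of the usual pattern, a mutex is embedded in the
structure whose fields it protects; the
.Vt struct counter_softc
and the helper functions below are hypothetical and shown for
illustration only.
.Bd -literal
struct counter_softc {
        struct mtx      cs_mtx;         /* protects cs_count */
        uint64_t        cs_count;
};

static void
counter_init(struct counter_softc *cs)
{
        mtx_init(&cs->cs_mtx, "counter", NULL, MTX_DEF);
        cs->cs_count = 0;
}

static void
counter_bump(struct counter_softc *cs)
{
        mtx_lock(&cs->cs_mtx);
        cs->cs_count++;
        mtx_unlock(&cs->cs_mtx);
}

static void
counter_fini(struct counter_softc *cs)
{
        mtx_destroy(&cs->cs_mtx);
}
.Ed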
.Ss Spin Mutexes Spin mutexes are a variation of basic mutexes; the main difference between the two is that spin mutexes never block. Instead, they spin while waiting for the lock to be released. To avoid deadlock, a thread that holds a spin mutex must never yield its CPU. Unlike ordinary mutexes, spin mutexes disable interrupts when acquired. Since disabling interrupts can be expensive, they are generally slower to acquire and release. Spin mutexes should be used only when absolutely necessary, e.g. to protect data shared with interrupt filter code (see .Xr bus_setup_intr 9 for details), or for scheduler internals. .Ss Mutex Pools With most synchronization primitives, such as mutexes, the programmer must provide memory to hold the primitive. For example, a mutex may be embedded inside the structure it protects. Mutex pools provide a preallocated set of mutexes to avoid this requirement. Note that mutexes from a pool may only be used as leaf locks. .Pp See .Xr mtx_pool 9 for details. .Ss Reader/Writer Locks Reader/writer locks allow shared access to protected data by multiple threads or exclusive access by a single thread. The threads with shared access are known as .Em readers since they should only read the protected data. A thread with exclusive access is known as a .Em writer since it may modify protected data. .Pp Reader/writer locks can be treated as mutexes (see above and .Xr mutex 9 ) with shared/exclusive semantics. Reader/writer locks support priority propagation like mutexes, but priority is propagated only to an exclusive holder. This limitation comes from the fact that shared owners are anonymous. .Pp See .Xr rwlock 9 for details. .Ss Read-Mostly Locks Read-mostly locks are similar to .Em reader/writer locks but optimized for very infrequent write locking. .Em Read-mostly locks implement full priority propagation by tracking shared owners using a caller-supplied .Em tracker data structure. .Pp See .Xr rmlock 9 for details. .Ss Sleepable Read-Mostly Locks Sleepable read-mostly locks are a variation on read-mostly locks. Threads holding an exclusive lock may sleep, but threads holding a shared lock may not. Priority is propagated to shared owners but not to exclusive owners. .Ss Shared/exclusive locks Shared/exclusive locks are similar to reader/writer locks; the main difference between them is that shared/exclusive locks may be held during unbounded sleep. Acquiring a contested shared/exclusive lock can perform an unbounded sleep. These locks do not support priority propagation. .Pp See .Xr sx 9 for details. .Ss Lockmanager locks Lockmanager locks are sleepable shared/exclusive locks used mostly in .Xr VFS 9 .Po as a .Xr vnode 9 lock .Pc and in the buffer cache .Po .Xr BUF_LOCK 9 .Pc . They have features other lock types do not have such as sleep timeouts, blocking upgrades, writer starvation avoidance, draining, and an interlock mutex, but this makes them complicated both to use and to implement; for this reason, they should be avoided. .Pp See .Xr lock 9 for details. .Ss Non-blocking synchronization The kernel has two facilities, .Xr epoch 9 and .Xr smr 9 , which can be used to provide read-only access to a data structure while one or more writers are concurrently modifying the data structure. Specifically, readers using .Xr epoch 9 and .Xr smr 9 to synchronize accesses do not block writers, in contrast with reader/writer locks, and they help ensure that memory freed by writers is not reused until all readers which may be accessing it have finished. 
Thus, they are a useful building block in the construction of lock-free data structures. .Pp These facilities are difficult to use correctly and should be avoided in preference to traditional mutual exclusion-based synchronization, except when performance or non-blocking guarantees are a major concern. .Pp See .Xr epoch 9 and .Xr smr 9 for details. .Ss Counting semaphores Counting semaphores provide a mechanism for synchronizing access to a pool of resources. Unlike mutexes, semaphores do not have the concept of an owner, so they can be useful in situations where one thread needs to acquire a resource, and another thread needs to release it. They are largely deprecated. .Pp See .Xr sema 9 for details. .Ss Condition variables Condition variables are used in conjunction with locks to wait for a condition to become true. A thread must hold the associated lock before calling one of the .Fn cv_wait , functions. When a thread waits on a condition, the lock is atomically released before the thread yields the processor and reacquired before the function call returns. Condition variables may be used with blocking mutexes, reader/writer locks, read-mostly locks, and shared/exclusive locks. .Pp See .Xr condvar 9 for details. .Ss Sleep/Wakeup The functions .Fn tsleep , .Fn msleep , .Fn msleep_spin , .Fn pause , .Fn wakeup , and .Fn wakeup_one also handle event-based thread blocking. Unlike condition variables, arbitrary addresses may be used as wait channels and a dedicated structure does not need to be allocated. However, care must be taken to ensure that wait channel addresses are unique to an event. If a thread must wait for an external event, it is put to sleep by .Fn tsleep , .Fn msleep , .Fn msleep_spin , or .Fn pause . Threads may also wait using one of the locking primitive sleep routines .Xr mtx_sleep 9 , .Xr rw_sleep 9 , or .Xr sx_sleep 9 . .Pp The parameter .Fa chan is an arbitrary address that uniquely identifies the event on which the thread is being put to sleep. All threads sleeping on a single .Fa chan are woken up later by .Fn wakeup .Pq often called from inside an interrupt routine to indicate that the event the thread was blocking on has occurred. .Pp Several of the sleep functions including .Fn msleep , .Fn msleep_spin , and the locking primitive sleep routines specify an additional lock parameter. The lock will be released before sleeping and reacquired before the sleep routine returns. If .Fa priority includes the .Dv PDROP flag, then the lock will not be reacquired before returning. The lock is used to ensure that a condition can be checked atomically, and that the current thread can be suspended without missing a change to the condition or an associated wakeup. In addition, all of the sleep routines will fully drop the .Va Giant mutex .Pq even if recursed while the thread is suspended and will reacquire the .Va Giant mutex .Pq restoring any recursion before the function returns. .Pp The .Fn pause function is a special sleep function that waits for a specified amount of time to pass before the thread resumes execution. This sleep cannot be terminated early by either an explicit .Fn wakeup or a signal. .Pp See .Xr sleep 9 for details. .Ss Giant Giant is a special mutex used to protect data structures that do not yet have their own locks. Since it provides semantics akin to the old .Xr spl 9 interface, Giant has special characteristics: .Bl -enum .It It is recursive. .It Drivers can request that Giant be locked around them by not marking themselves MPSAFE. 
Note that infrastructure to do this is slowly going away as non-MPSAFE drivers either became properly locked or disappear. .It Giant must be locked before other non-sleepable locks. .It Giant is dropped during unbounded sleeps and reacquired after wakeup. .It There are places in the kernel that drop Giant and pick it back up again. Sleep locks will do this before sleeping. Parts of the network or VM code may do this as well. This means that you cannot count on Giant keeping other code from running if your code sleeps, even if you want it to. .El .Sh INTERACTIONS The primitives can interact and have a number of rules regarding how they can and can not be combined. Many of these rules are checked by .Xr witness 4 . .Ss Bounded vs. Unbounded Sleep In a bounded sleep .Po also referred to as .Dq blocking .Pc the only resource needed to resume execution of a thread is CPU time for the owner of a lock that the thread is waiting to acquire. In an unbounded sleep .Po often referred to as simply .Dq sleeping .Pc a thread waits for an external event or for a condition to become true. In particular, a dependency chain of threads in bounded sleeps should always make forward progress, since there is always CPU time available. This requires that no thread in a bounded sleep is waiting for a lock held by a thread in an unbounded sleep. To avoid priority inversions, a thread in a bounded sleep lends its priority to the owner of the lock that it is waiting for. .Pp The following primitives perform bounded sleeps: mutexes, reader/writer locks and read-mostly locks. .Pp The following primitives perform unbounded sleeps: sleepable read-mostly locks, shared/exclusive locks, lockmanager locks, counting semaphores, condition variables, and sleep/wakeup. .Ss General Principles .Bl -bullet .It It is an error to do any operation that could result in yielding the processor while holding a spin mutex. .It It is an error to do any operation that could result in unbounded sleep while holding any primitive from the 'bounded sleep' group. For example, it is an error to try to acquire a shared/exclusive lock while holding a mutex, or to try to allocate memory with M_WAITOK while holding a reader/writer lock. .Pp Note that the lock passed to one of the .Fn sleep or .Fn cv_wait functions is dropped before the thread enters the unbounded sleep and does not violate this rule. .It It is an error to do any operation that could result in yielding of the processor when running inside an interrupt filter. .It It is an error to do any operation that could result in unbounded sleep when running inside an interrupt thread. .El .Ss Interaction table The following table shows what you can and can not do while holding one of the locking primitives discussed. Note that .Dq sleep includes .Fn sema_wait , .Fn sema_timedwait , any of the .Fn cv_wait functions, and any of the .Fn sleep functions. 
.Bl -column ".Ic xxxxxxxxxxxxxxxx" ".Xr XXXXXXXXX" ".Xr XXXXXXXXX" ".Xr XXXXXXX" ".Xr XXXXXXXXX" ".Xr XXXXXX" -offset 3n .It Em " You want:" Ta spin mtx Ta mutex/rw Ta rmlock Ta sleep rm Ta sx/lk Ta sleep .It Em "You have: " Ta -------- Ta -------- Ta ------ Ta -------- Ta ------ Ta ------ .It spin mtx Ta \&ok Ta \&no Ta \&no Ta \&no Ta \&no Ta \&no-1 .It mutex/rw Ta \&ok Ta \&ok Ta \&ok Ta \&no Ta \&no Ta \&no-1 .It rmlock Ta \&ok Ta \&ok Ta \&ok Ta \&no Ta \&no Ta \&no-1 .It sleep rm Ta \&ok Ta \&ok Ta \&ok Ta \&ok-2 Ta \&ok-2 Ta \&ok-2/3 .It sx Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok-3 .It lockmgr Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok .El .Pp .Em *1 There are calls that atomically release this primitive when going to sleep and reacquire it on wakeup .Po .Fn mtx_sleep , .Fn rw_sleep , .Fn msleep_spin , etc. .Pc . .Pp .Em *2 These cases are only allowed while holding a write lock on a sleepable read-mostly lock. .Pp .Em *3 Though one can sleep while holding this lock, one can also use a .Fn sleep function to atomically release this primitive when going to sleep and reacquire it on wakeup. .Pp Note that non-blocking try operations on locks are always permitted. .Ss Context mode table The next table shows what can be used in different contexts. At this time this is a rather easy to remember table. .Bl -column ".Ic Xxxxxxxxxxxxxxxxxxx" ".Xr XXXXXXXXX" ".Xr XXXXXXXXX" ".Xr XXXXXXX" ".Xr XXXXXXXXX" ".Xr XXXXXX" -offset 3n .It Em "Context:" Ta spin mtx Ta mutex/rw Ta rmlock Ta sleep rm Ta sx/lk Ta sleep .It interrupt filter: Ta \&ok Ta \&no Ta \&no Ta \&no Ta \&no Ta \&no .It interrupt thread: Ta \&ok Ta \&ok Ta \&ok Ta \&no Ta \&no Ta \&no .It callout: Ta \&ok Ta \&ok Ta \&ok Ta \&no Ta \&no Ta \&no .It direct callout: Ta \&ok Ta \&no Ta \&no Ta \&no Ta \&no Ta \&no .It system call: Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok Ta \&ok .El .Sh SEE ALSO .Xr lockstat 1 , .Xr witness 4 , .Xr atomic 9 , .Xr BUS_SETUP_INTR 9 , +.Xr callout 9 , .Xr condvar 9 , .Xr epoch 9 , .Xr lock 9 , .Xr LOCK_PROFILING 9 , .Xr mtx_pool 9 , .Xr mutex 9 , .Xr rmlock 9 , .Xr rwlock 9 , .Xr sema 9 , .Xr sleep 9 , .Xr smr 9 , -.Xr sx 9 , -.Xr timeout 9 +.Xr sx 9 .Sh BUGS There are too many locking primitives to choose from. diff --git a/share/man/man9/sleep.9 b/share/man/man9/sleep.9 index 78625e5e1184..1a471bcfc00d 100644 --- a/share/man/man9/sleep.9 +++ b/share/man/man9/sleep.9 @@ -1,423 +1,423 @@ .\" .\" Copyright (c) 1996 Joerg Wunsch .\" .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd June 19, 2019 .Dt SLEEP 9 .Os .Sh NAME .Nm msleep , .Nm msleep_sbt , .Nm msleep_spin , .Nm msleep_spin_sbt , .Nm pause , .Nm pause_sig , .Nm pause_sbt , .Nm tsleep , .Nm tsleep_sbt , .Nm wakeup , .Nm wakeup_one , .Nm wakeup_any .Nd wait for events .Sh SYNOPSIS .In sys/param.h .In sys/systm.h .In sys/proc.h .Ft int .Fn msleep "const void *chan" "struct mtx *mtx" "int priority" "const char *wmesg" "int timo" .Ft int .Fn msleep_sbt "const void *chan" "struct mtx *mtx" "int priority" \ "const char *wmesg" "sbintime_t sbt" "sbintime_t pr" "int flags" .Ft int .Fn msleep_spin "const void *chan" "struct mtx *mtx" "const char *wmesg" "int timo" .Ft int .Fn msleep_spin_sbt "const void *chan" "struct mtx *mtx" "const char *wmesg" \ "sbintime_t sbt" "sbintime_t pr" "int flags" .Ft int .Fn pause "const char *wmesg" "int timo" .Ft int .Fn pause_sig "const char *wmesg" "int timo" .Ft int .Fn pause_sbt "const char *wmesg" "sbintime_t sbt" "sbintime_t pr" \ "int flags" .Ft int .Fn tsleep "const void *chan" "int priority" "const char *wmesg" "int timo" .Ft int .Fn tsleep_sbt "const void *chan" "int priority" "const char *wmesg" \ "sbintime_t sbt" "sbintime_t pr" "int flags" .Ft void .Fn wakeup "const void *chan" .Ft void .Fn wakeup_one "const void *chan" .Ft void .Fn wakeup_any "const void *chan" .Sh DESCRIPTION The functions .Fn tsleep , .Fn msleep , .Fn msleep_spin , .Fn pause , .Fn pause_sig , .Fn pause_sbt , .Fn wakeup , .Fn wakeup_one , and .Fn wakeup_any handle event-based thread blocking. If a thread must wait for an external event, it is put to sleep by .Fn tsleep , .Fn msleep , .Fn msleep_spin , .Fn pause , .Fn pause_sig , or .Fn pause_sbt . Threads may also wait using one of the locking primitive sleep routines .Xr mtx_sleep 9 , .Xr rw_sleep 9 , or .Xr sx_sleep 9 . .Pp The parameter .Fa chan is an arbitrary address that uniquely identifies the event on which the thread is being put to sleep. All threads sleeping on a single .Fa chan are woken up later by .Fn wakeup , often called from inside an interrupt routine, to indicate that the resource the thread was blocking on is available now. .Pp The parameter .Fa priority specifies a new priority for the thread as well as some optional flags. If the new priority is not 0, then the thread will be made runnable with the specified .Fa priority when it resumes. .Dv PZERO should never be used, as it is for compatibility only. A new priority of 0 means to use the thread's current priority when it is made runnable again. .Pp If .Fa priority includes the .Dv PCATCH flag, pending signals are allowed to interrupt the sleep, otherwise pending signals are ignored during the sleep. If .Dv PCATCH is set and a signal becomes pending, .Er ERESTART is returned if the current system call should be restarted if possible, and .Er EINTR is returned if the system call should be interrupted by the signal (return .Er EINTR ) . .Pp The parameter .Fa wmesg is a string describing the sleep condition for tools like .Xr ps 1 . 
Due to the limited space of those programs to display arbitrary strings, this message should not be longer than 6 characters. .Pp The parameter .Fa timo specifies a timeout for the sleep. If .Fa timo is not 0, then the thread will sleep for at most .Fa timo No / Va hz seconds. If the timeout expires, then the sleep function will return .Er EWOULDBLOCK . .Pp .Fn msleep_sbt , .Fn msleep_spin_sbt , .Fn pause_sbt and .Fn tsleep_sbt functions take .Fa sbt parameter instead of .Fa timo . It allows the caller to specify relative or absolute wakeup time with higher resolution in form of .Vt sbintime_t . The parameter .Fa pr allows the caller to specify wanted absolute event precision. The parameter .Fa flags allows the caller to pass additional .Fn callout_reset_sbt flags. .Pp Several of the sleep functions including .Fn msleep , .Fn msleep_spin , and the locking primitive sleep routines specify an additional lock parameter. The lock will be released before sleeping and reacquired before the sleep routine returns. If .Fa priority includes the .Dv PDROP flag, then the lock will not be reacquired before returning. The lock is used to ensure that a condition can be checked atomically, and that the current thread can be suspended without missing a change to the condition, or an associated wakeup. In addition, all of the sleep routines will fully drop the .Va Giant mutex (even if recursed) while the thread is suspended and will reacquire the .Va Giant mutex before the function returns. Note that the .Va Giant mutex may be specified as the lock to drop. In that case, however, the .Dv PDROP flag is not allowed. .Pp To avoid lost wakeups, either a lock should be used to protect against races, or a timeout should be specified to place an upper bound on the delay due to a lost wakeup. As a result, the .Fn tsleep function should only be invoked with a timeout of 0 when the .Va Giant mutex is held. .Pp The .Fn msleep function requires that .Fa mtx reference a default, i.e. non-spin, mutex. Its use is deprecated in favor of .Xr mtx_sleep 9 which provides identical behavior. .Pp The .Fn msleep_spin function requires that .Fa mtx reference a spin mutex. The .Fn msleep_spin function does not accept a .Fa priority parameter and thus does not support changing the current thread's priority, the .Dv PDROP flag, or catching signals via the .Dv PCATCH flag. .Pp The .Fn pause function is a wrapper around .Fn tsleep that suspends execution of the current thread for the indicated timeout. The thread can not be awakened early by signals or calls to .Fn wakeup , .Fn wakeup_one or .Fn wakeup_any . The .Fn pause_sig function is a variant of .Fn pause which can be awakened early by signals. .Pp The .Fn wakeup_one function makes the first highest priority thread in the queue that is sleeping on the parameter .Fa chan runnable. This reduces the load when a large number of threads are sleeping on the same address, but only one of them can actually do any useful work when made runnable. .Pp Due to the way it works, the .Fn wakeup_one function requires that only related threads sleep on a specific .Fa chan address. It is the programmer's responsibility to choose a unique .Fa chan value. The older .Fn wakeup function did not require this, though it was never good practice for threads to share a .Fa chan value. When converting from .Fn wakeup to .Fn wakeup_one , pay particular attention to ensure that no other threads wait on the same .Fa chan . 
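.Pp
As a minimal sketch of this pattern, the producer/consumer pair below
sleeps on an address that is unique to the queue it protects; the
.Vt struct workq
and
.Vt struct work
layouts and the helper functions are hypothetical.
.Bd -literal
struct work {
        STAILQ_ENTRY(work)      w_link;
        /* ... payload ... */
};

struct workq {
        struct mtx              wq_mtx;
        STAILQ_HEAD(, work)     wq_head;
};

/* Consumer: sleep until an item is queued. */
static struct work *
workq_get(struct workq *wq)
{
        struct work *w;

        mtx_lock(&wq->wq_mtx);
        while ((w = STAILQ_FIRST(&wq->wq_head)) == NULL) {
                /*
                 * &wq->wq_head is the wait channel; it is unique to
                 * this queue, as wakeup_one() requires.
                 */
                msleep(&wq->wq_head, &wq->wq_mtx, PRIBIO, "wqwait", 0);
        }
        STAILQ_REMOVE_HEAD(&wq->wq_head, w_link);
        mtx_unlock(&wq->wq_mtx);
        return (w);
}

/* Producer: queue an item and wake a single consumer. */
static void
workq_put(struct workq *wq, struct work *w)
{
        mtx_lock(&wq->wq_mtx);
        STAILQ_INSERT_TAIL(&wq->wq_head, w, w_link);
        wakeup_one(&wq->wq_head);
        mtx_unlock(&wq->wq_mtx);
}
.Ed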
.Pp The .Fn wakeup_any function is similar to .Fn wakeup_one , except that it makes runnable the thread that most recently went to sleep on the queue, ignoring fairness. It can be used when threads sleeping on the .Fa chan are known to be identical and there is no reason to be fair. .Pp If the timeout given by .Fa timo or .Fa sbt is based on an absolute real-time clock value, then the thread should copy the global .Va rtc_generation into its .Va td_rtcgen member before reading the RTC. If the real-time clock is adjusted, these functions will set .Va td_rtcgen to zero and return zero. The caller should reconsider its orientation with the new RTC value. .Sh RETURN VALUES When awakened by a call to .Fn wakeup or .Fn wakeup_one , if a signal is pending and .Dv PCATCH is specified, a non-zero error code is returned. If the thread is awakened by a call to .Fn wakeup or .Fn wakeup_one , the .Fn msleep , .Fn msleep_spin , .Fn tsleep , and locking primitive sleep functions return 0. Zero can also be returned when the real-time clock is adjusted; see above regarding .Va td_rtcgen . Otherwise, a non-zero error code is returned. .Sh ERRORS .Fn msleep , .Fn msleep_spin , .Fn tsleep , and the locking primitive sleep functions will fail if: .Bl -tag -width Er .It Bq Er EINTR The .Dv PCATCH flag was specified, a signal was caught, and the system call should be interrupted. .It Bq Er ERESTART The .Dv PCATCH flag was specified, a signal was caught, and the system call should be restarted. .It Bq Er EWOULDBLOCK A non-zero timeout was specified and the timeout expired. .El .Sh SEE ALSO .Xr ps 1 , +.Xr callout 9 , .Xr locking 9 , .Xr malloc 9 , .Xr mi_switch 9 , .Xr mtx_sleep 9 , .Xr rw_sleep 9 , -.Xr sx_sleep 9 , -.Xr timeout 9 +.Xr sx_sleep 9 .Sh HISTORY The functions .Fn sleep and .Fn wakeup were present in .At v1 . They were probably also present in the preceding PDP-7 version of .Ux . They were the basic process synchronization model. .Pp The .Fn tsleep function appeared in .Bx 4.4 and added the parameters .Fa wmesg and .Fa timo . The .Fn sleep function was removed in .Fx 2.2 . The .Fn wakeup_one function appeared in .Fx 2.2 . The .Fn msleep function appeared in .Fx 5.0 , and the .Fn msleep_spin function appeared in .Fx 6.2 . The .Fn pause function appeared in .Fx 7.0 . The .Fn pause_sig function appeared in .Fx 12.0 . .Sh AUTHORS .An -nosplit This manual page was written by .An J\(:org Wunsch Aq Mt joerg@FreeBSD.org . diff --git a/share/man/man9/sleepqueue.9 b/share/man/man9/sleepqueue.9 index 64dfc3bdee55..ae3bbaab7460 100644 --- a/share/man/man9/sleepqueue.9 +++ b/share/man/man9/sleepqueue.9 @@ -1,390 +1,390 @@ .\" Copyright (c) 2000-2004 John H. Baldwin .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd June 19, 2019 .Dt SLEEPQUEUE 9 .Os .Sh NAME .Nm init_sleepqueues , .Nm sleepq_abort , .Nm sleepq_add , .Nm sleepq_alloc , .Nm sleepq_broadcast , .Nm sleepq_free , .Nm sleepq_lock , .Nm sleepq_lookup , .Nm sleepq_release , .Nm sleepq_remove , .Nm sleepq_signal , .Nm sleepq_set_timeout , .Nm sleepq_set_timeout_sbt , .Nm sleepq_sleepcnt , .Nm sleepq_timedwait , .Nm sleepq_timedwait_sig , .Nm sleepq_type , .Nm sleepq_wait , .Nm sleepq_wait_sig .Nd manage the queues of sleeping threads .Sh SYNOPSIS .In sys/param.h .In sys/sleepqueue.h .Ft void .Fn init_sleepqueues "void" .Ft int .Fn sleepq_abort "struct thread *td" .Ft void .Fn sleepq_add "const void *wchan" "struct lock_object *lock" "const char *wmesg" "int flags" "int queue" .Ft struct sleepqueue * .Fn sleepq_alloc "void" .Ft int .Fn sleepq_broadcast "const void *wchan" "int flags" "int pri" "int queue" .Ft void .Fn sleepq_free "struct sleepqueue *sq" .Ft struct sleepqueue * .Fn sleepq_lookup "const void *wchan" .Ft void .Fn sleepq_lock "const void *wchan" .Ft void .Fn sleepq_release "const void *wchan" .Ft void .Fn sleepq_remove "struct thread *td" "const void *wchan" .Ft int .Fn sleepq_signal "const void *wchan" "int flags" "int pri" "int queue" .Ft void .Fn sleepq_set_timeout "const void *wchan" "int timo" .Ft void .Fn sleepq_set_timeout_sbt "const void *wchan" "sbintime_t sbt" \ "sbintime_t pr" "int flags" .Ft u_int .Fn sleepq_sleepcnt "const void *wchan" "int queue" .Ft int .Fn sleepq_timedwait "const void *wchan" "int pri" .Ft int .Fn sleepq_timedwait_sig "const void *wchan" "int pri" .Ft int .Fn sleepq_type "const void *wchan" .Ft void .Fn sleepq_wait "const void *wchan" "int pri" .Ft int .Fn sleepq_wait_sig "const void *wchan" "int pri" .Sh DESCRIPTION Sleep queues provide a mechanism for suspending execution of a thread until some condition is met. Each queue is associated with a specific wait channel when it is active, and only one queue may be associated with a wait channel at any given point in time. The implementation of each wait channel splits its sleepqueue into 2 sub-queues in order to enable some optimizations on threads' wakeups. An active queue holds a list of threads that are blocked on the associated wait channel. Threads that are not blocked on a wait channel have an associated inactive sleep queue. When a thread blocks on a wait channel it donates its inactive sleep queue to the wait channel. When a thread is resumed, the wait channel that it was blocked on gives it an inactive sleep queue for later use. .Pp The .Fn sleepq_alloc function allocates an inactive sleep queue and is used to assign a sleep queue to a thread during thread creation. The .Fn sleepq_free function frees the resources associated with an inactive sleep queue and is used to free a queue during thread destruction. .Pp Active sleep queues are stored in a hash table hashed on the addresses pointed to by wait channels. Each bucket in the hash table contains a sleep queue chain. 
A sleep queue chain contains a spin mutex and a list of sleep queues that hash to that specific chain. Active sleep queues are protected by their chain's spin mutex. The .Fn init_sleepqueues function initializes the hash table of sleep queue chains. .Pp The .Fn sleepq_lock function locks the sleep queue chain associated with wait channel .Fa wchan . .Pp The .Fn sleepq_lookup function returns a pointer to the currently active sleep queue associated with .Fa wchan , or .Dv NULL if there is no active sleep queue associated with argument .Fa wchan . It requires the sleep queue chain associated with .Fa wchan to have been locked by a prior call to .Fn sleepq_lock . .Pp The .Fn sleepq_release function unlocks the sleep queue chain associated with .Fa wchan and is primarily useful when aborting a pending sleep request before one of the wait functions is called. .Pp The .Fn sleepq_add function places the current thread on the sleep queue associated with the wait channel .Fa wchan . The sleep queue chain associated with argument .Fa wchan must be locked by a prior call to .Fn sleepq_lock when this function is called. If a lock is specified via the .Fa lock argument, and if the kernel was compiled with .Cd "options INVARIANTS" , then the sleep queue code will perform extra checks to ensure that the lock is used by all threads sleeping on .Fa wchan . The .Fa wmesg parameter should be a short description of .Fa wchan . The .Fa flags parameter is a bitmask consisting of the type of sleep queue being slept on and zero or more optional flags. The .Fa queue parameter specifies the sub-queue into which the contending thread will be inserted. .Pp There are currently three types of sleep queues: .Pp .Bl -tag -width ".Dv SLEEPQ_CONDVAR" -compact .It Dv SLEEPQ_CONDVAR A sleep queue used to implement condition variables. .It Dv SLEEPQ_SLEEP A sleep queue used to implement .Xr sleep 9 , .Xr wakeup 9 and .Xr wakeup_one 9 . .It Dv SLEEPQ_PAUSE A sleep queue used to implement .Xr pause 9 . .El .Pp There are currently two optional flags: .Pp .Bl -tag -width ".Dv SLEEPQ_INTERRUPTIBLE" -compact .It Dv SLEEPQ_INTERRUPTIBLE The current thread is entering an interruptible sleep. .It Dv SLEEPQ_STOP_ON_BDRY When a thread is entering an interruptible sleep, do not stop it upon arrival of a stop action, like .Dv SIGSTOP . Wake it up instead. .El .Pp A timeout on the sleep may be specified by calling .Fn sleepq_set_timeout after .Fn sleepq_add . The .Fa wchan parameter should be the same value from the preceding call to .Fn sleepq_add , and the sleep queue chain associated with .Fa wchan must have been locked by a prior call to .Fn sleepq_lock . The .Fa timo parameter should specify the timeout value in ticks. .Pp The .Fn sleepq_set_timeout_sbt function takes an .Fa sbt argument instead of .Fa timo . It allows the caller to specify a relative or absolute wakeup time with higher resolution in the form of .Vt sbintime_t . The parameter .Fa pr allows the caller to specify the wanted absolute event precision. The parameter .Fa flags allows the caller to pass additional .Fn callout_reset_sbt flags. .Pp Once the thread is ready to suspend, one of the wait functions is called to put the current thread to sleep until it is awakened and to context switch to another thread. The .Fn sleepq_wait function is used for non-interruptible sleeps that do not have a timeout. The .Fn sleepq_timedwait function is used for non-interruptible sleeps that have had a timeout set via .Fn sleepq_set_timeout .
The .Fn sleepq_wait_sig function is used for interruptible sleeps that do not have a timeout. The .Fn sleepq_timedwait_sig function is used for interruptible sleeps that do have a timeout set. The .Fa wchan argument to all of the wait functions is the wait channel being slept on. The sleep queue chain associated with argument .Fa wchan needs to have been locked with a prior call to .Fn sleepq_lock . The .Fa pri argument is used to set the priority of the thread when it is awakened. If it is set to zero, the thread's priority is left alone. .Pp When the thread is resumed, the wait functions return a non-zero value if the thread was awakened due to an interrupt other than a signal or a timeout. If the sleep timed out, then .Er EWOULDBLOCK is returned. If the sleep was interrupted by something other than a signal, then some other return value will be returned. .Pp A sleeping thread is normally resumed by the .Fn sleepq_broadcast and .Fn sleepq_signal functions. The .Fn sleepq_signal function awakens the highest priority thread sleeping on a wait channel (or, if the .Dv SLEEPQ_UNFAIR flag is set, the thread that most recently went to sleep), while .Fn sleepq_broadcast awakens all of the threads sleeping on a wait channel. The .Fa wchan argument specifies which wait channel to awaken. The .Fa flags argument must match the sleep queue type contained in the .Fa flags argument passed to .Fn sleepq_add by the threads sleeping on the wait channel. If the .Fa pri argument does not equal \-1, then each thread that is awakened will have its priority raised to .Fa pri if it has a lower priority. The sleep queue chain associated with argument .Fa wchan must be locked by a prior call to .Fn sleepq_lock before calling any of these functions. The .Fa queue argument specifies the sub-queue from which threads are to be woken up. .Pp A thread in an interruptible sleep can be interrupted by another thread via the .Fn sleepq_abort function. The .Fa td argument specifies the thread to interrupt. An individual thread can also be awakened from sleeping on a specific wait channel via the .Fn sleepq_remove function. The .Fa td argument specifies the thread to awaken and the .Fa wchan argument specifies the wait channel to awaken it from. If the thread .Fa td is not blocked on the wait channel .Fa wchan then this function will not do anything, even if the thread is asleep on a different wait channel. This function should only be used if one of the other functions above is not sufficient. One possible use is waking up a specific thread from a widely shared sleep channel. .Pp The .Fn sleepq_sleepcnt function offers a simple way to retrieve the number of threads sleeping for the specified .Fa queue , given a .Fa wchan . .Pp The .Fn sleepq_type function returns the type of the sleep queue associated with .Fa wchan . .Pp The .Fn sleepq_abort , .Fn sleepq_broadcast , and .Fn sleepq_signal functions all return a boolean value. If the return value is true, then at least one thread was resumed that is currently swapped out. The caller is responsible for awakening the scheduler process so that the resumed thread will be swapped back in. This is done by calling the .Fn kick_proc0 function after releasing the sleep queue chain lock via a call to .Fn sleepq_release . .Pp The sleep queue interface is currently used to implement the .Xr sleep 9 and .Xr condvar 9 interfaces. Almost all other code in the kernel should use one of those interfaces rather than manipulating sleep queues directly.
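.Pp
For illustration only (as noted above, most code should use the
.Xr sleep 9
or
.Xr condvar 9
interfaces instead), the sketch below shows the usual call sequence;
the
.Va sleep_flag
variable and the helper functions are hypothetical.
.Bd -literal
static int sleep_flag;  /* serialized by its sleep queue chain lock */

static void
flag_wait(void)
{
        sleepq_lock(&sleep_flag);
        while (sleep_flag == 0) {
                sleepq_add(&sleep_flag, NULL, "flgwt", SLEEPQ_SLEEP, 0);
                /* The chain lock is released while sleeping. */
                sleepq_wait(&sleep_flag, 0);
                sleepq_lock(&sleep_flag);
        }
        sleepq_release(&sleep_flag);
}

static void
flag_set(void)
{
        int wakeup_swapper;

        sleepq_lock(&sleep_flag);
        sleep_flag = 1;
        /* Wake all threads sleeping on &sleep_flag. */
        wakeup_swapper = sleepq_broadcast(&sleep_flag, SLEEPQ_SLEEP, 0, 0);
        sleepq_release(&sleep_flag);
        if (wakeup_swapper)
                kick_proc0();
}
.Ed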
.Sh SEE ALSO +.Xr callout 9 , .Xr condvar 9 , .Xr runqueue 9 , .Xr scheduler 9 , -.Xr sleep 9 , -.Xr timeout 9 +.Xr sleep 9 diff --git a/share/man/man9/tvtohz.9 b/share/man/man9/tvtohz.9 index cf07d603a790..d3918bdb85c6 100644 --- a/share/man/man9/tvtohz.9 +++ b/share/man/man9/tvtohz.9 @@ -1,58 +1,58 @@ .\" Copyright (c) 2000 Kelly Yancey .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd January 3, 2000 .Dt TVTOHZ 9 .Os .Sh NAME .Nm tvtohz .Nd convert time interval to tick count .Sh SYNOPSIS .In sys/time.h .Ft int .Fn tvtohz "struct timeval *tv" .Sh DESCRIPTION The .Fn tvtohz function accepts a single argument .Fa tv which specifies the time interval over which to calculate the number of system ticks that would elapse. .Sh RETURN VALUES Returns the integral number of system ticks expected to elapse in the given interval, including the current tick. .Sh SEE ALSO +.Xr callout 9 , .Xr microtime 9 , -.Xr microuptime 9 , -.Xr timeout 9 +.Xr microuptime 9 .Sh HISTORY The .Nm function first appeared in .Fx 3.0 .Sh AUTHORS This manual page was written by .An Kelly Yancey Aq Mt kbyanc@posi.net . diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index 0ac0eca36da4..05497f3d46d7 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -1,1554 +1,1554 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #include "opt_ddb.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif #ifdef SMP #include #endif DPCPU_DECLARE(sbintime_t, hardclocktime); SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); static void softclock_thread(void *arg); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. 
" "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default)"); /* * TODO: * allocate more timeout table slots when table overflows. */ static u_int __read_mostly callwheelsize; static u_int __read_mostly callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; callout_func_t *cc_drain; void *cc_last_func; void *cc_last_arg; #ifdef SMP callout_func_t *ce_migration_func; void *ce_migration_arg; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; sbintime_t cc_firstevent; sbintime_t cc_lastscan; struct thread *cc_thread; u_int cc_bucket; #ifdef KTR char cc_ktr_event_name[20]; #endif }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func #define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec static struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else static struct callout_cpu cc_cpu; #define CC_CPU(cpu) (&cc_cpu) #define CC_SELF() (&cc_cpu) #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int __read_mostly cc_default_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock_call_cc() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on the BSP during kernel startup. 
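 * The wheel size is derived from ncallout (tunable as kern.ncallout) and
 * rounded up to the next power of two, so a bucket lookup reduces to a
 * simple mask with callwheelmask.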
*/ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; int cpu; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Initialize callout wheels. The software interrupt threads * are created later. */ cc_default_cpu = PCPU_GET(cpuid); CPU_FOREACH(cpu) { cc = CC_CPU(cpu); callout_cpu_init(cc, cpu); } } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN); cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); #ifdef KTR snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); #endif } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start softclock threads. */ static void start_softclock(void *dummy) { struct proc *p; struct thread *td; struct callout_cpu *cc; int cpu, error; bool pin_swi; p = NULL; CPU_FOREACH(cpu) { cc = CC_CPU(cpu); error = kproc_kthread_add(softclock_thread, cc, &p, &td, RFSTOPPED, 0, "clock", "clock (%d)", cpu); if (error != 0) panic("failed to create softclock thread for cpu %d: %d", cpu, error); CC_LOCK(cc); cc->cc_thread = td; thread_lock(td); sched_class(td, PRI_ITHD); sched_ithread_prio(td, PI_SOFTCLOCK); TD_SET_IWAIT(td); thread_lock_set(td, (struct mtx *)&cc->cc_lock); thread_unlock(td); if (cpu == cc_default_cpu) pin_swi = pin_default_swi; else pin_swi = pin_pcpu_swi; if (pin_swi) { error = cpuset_setithread(td->td_tid, cpu); if (error != 0) printf("%s: %s clock couldn't be pinned to cpu %d: %d\n", __func__, cpu == cc_default_cpu ? 
"default" : "per-cpu", cpu, error); } } } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout_entropy { struct callout_cpu *cc; struct thread *td; sbintime_t now; } entropy; struct callout *c, *next; struct callout_cpu *cc; struct callout_list *sc; struct thread *td; sbintime_t first, last, lookahead, max, tmp_max; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = SBT_1S; first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; LIST_FOREACH_SAFE(c, sc, c_links.le, next) { /* Run the callout if present time within allowed. */ if (c->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (c->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = next; cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(c, c_links.le); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); next = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { LIST_REMOVE(c, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, c, c_links.tqe); c->c_iflags |= CALLOUT_PROCESSED; } } else if (c->c_time >= max) { /* * Skip events in the distant future. */ ; } else if (c->c_time > last) { /* * Event minimal time is bigger than present * maximal time, so it cannot be aggregated. */ lastb = nowb; } else { /* * Update first and last time, respecting this * event. */ if (c->c_time < first) first = c->c_time; tmp_max = c->c_time + c->c_precision; if (tmp_max < last) last = tmp_max; } } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. 
*/ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; cpu_new_callout(curcpu, last, first); #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif if (!TAILQ_EMPTY(&cc->cc_expireq)) { entropy.cc = cc; entropy.td = curthread; entropy.now = now; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_CALLOUT); td = cc->cc_thread; if (TD_AWAITING_INTR(td)) { thread_lock_block_wait(td); THREAD_LOCK_ASSERT(td, MA_OWNED); TD_CLR_IWAIT(td); sched_wakeup(td, SRQ_INTR); } else mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); } else mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(c->c_cpu, sbt, c->c_time); } } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; callout_func_t *c_func, *drain; void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; callout_func_t *new_func; void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static callout_func_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? 
LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_iflags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_last_func(cc, direct) = c_func; cc_exec_last_arg(cc, direct) = c_arg; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( - "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", + "Expensive callout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. 
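 * If callout_stop() cleared CALLOUT_DFRMIGRATION while this callout was
 * executing, the deferred migration below is abandoned.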
*/ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt thread handler. * Run periodic events from timeout queue. */ static void softclock_thread(void *arg) { struct thread *td = curthread; struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth, gcalls, lockcalls, mpcalls; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); for (;;) { while (TAILQ_EMPTY(&cc->cc_expireq)) { /* * Use CC_LOCK(cc) as the thread_lock while * idle. */ thread_lock(td); thread_lock_set(td, (struct mtx *)&cc->cc_lock); TD_SET_IWAIT(td); mi_switch(SW_VOL | SWT_IWAIT); /* mi_switch() drops thread_lock(). */ CC_LOCK(cc); } #ifdef CALLOUT_PROFILING depth = gcalls = lockcalls = mpcalls = 0; #endif while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif } } void callout_when(sbintime_t sbt, sbintime_t precision, int flags, sbintime_t *res, sbintime_t *prec_res) { sbintime_t to_sbt, to_pr; if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { *res = sbt; *prec_res = precision; return; } if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) sbt = tick_sbt; if ((flags & C_HARDCLOCK) != 0 || sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif if (cold && to_sbt == 0) to_sbt = sbinuptime(); if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; *res = to_sbt; to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); *prec_res = to_pr > precision ? to_pr : precision; } /* * New interface; clients allocate their own callout structures. 
* * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, callout_func_t *ftn, void *arg, int cpu, int flags) { sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; cancelled = 0; callout_when(sbt, prec, flags, &to_sbt, &precision); /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL || (LOCK_CLASS(c->c_lock)->lc_flags & LC_SPINLOCK), ("%s: direct callout %p has non-spin lock", __func__, c)); cc = callout_lock(c); if (cpu == -1) cpu = c->c_cpu; KASSERT(cpu >= 0 && cpu <= mp_maxid && !CPU_ABSENT(cpu), ("%s: invalid cpu %d", __func__, cpu)); if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. 
* At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int flags, callout_func_t *drain) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int cancelled, not_on_a_list; if ((flags & CS_DRAIN) != 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); KASSERT((flags & CS_DRAIN) == 0 || drain == NULL, ("Cannot set drain callback and CS_DRAIN flag at the same time")); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout is running, try to stop it or drain it. */ if (cc_exec_curr(cc, direct) == c) { /* * Succeed we to stop it or not, we must clear the * active flag - this is what API users expect. 
If we're * draining and the callout is currently executing, first wait * until it finishes. */ if ((flags & CS_DRAIN) == 0) c->c_flags &= ~CALLOUT_ACTIVE; if ((flags & CS_DRAIN) != 0) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ if (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); goto again; } c->c_flags &= ~CALLOUT_ACTIVE; } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). */ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. 
*/ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { KASSERT(cc_exec_drain(cc, direct) == NULL, ("callout drain function already set to %p", cc_exec_drain(cc, direct))); cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return (0); } else { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { KASSERT(cc_exec_drain(cc, direct) == NULL, ("callout drain function already set to %p", cc_exec_drain(cc, direct))); cc_exec_drain(cc, direct) = drain; } } KASSERT(!sq_locked, ("sleepqueue chain still locked")); cancelled = 0; } else cancelled = 1; if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); if ((c->c_iflags & CALLOUT_PENDING) == 0) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); /* * For not scheduled and not executing callout return * negative value. */ if (cc_exec_curr(cc, direct) != c) cancelled = -1; CC_UNLOCK(cc); return (cancelled); } c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } CC_UNLOCK(cc); return (cancelled); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = cc_default_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & LC_SLEEPABLE), ("%s: callout %p has sleepable lock", __func__, c)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = cc_default_cpu; } static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. 
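 * The report is generated on a write to the kern.callout_stat sysctl and
 * is printed to the console rather than returned to the caller.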
*/ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; int cpu; val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); CPU_FOREACH(cpu) { cc = CC_CPU(cpu); CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); } for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? (1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? 
		    (((sbintime_t)1) << (i - 1)) : 0;
		tcum += ct[i];
		pcum += cpr[i];
		printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
		    i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum);
	}
	return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_kern_callout_stat, "I",
    "Dump immediate statistic snapshot of the scheduled callouts");

#ifdef DDB
static void
_show_callout(struct callout *c)
{
	db_printf("callout %p\n", c);
#define	C_DB_PRINTF(f, e)	db_printf(" %s = " f "\n", #e, c->e);
	db_printf(" &c_links = %p\n", &(c->c_links));
	C_DB_PRINTF("%" PRId64, c_time);
	C_DB_PRINTF("%" PRId64, c_precision);
	C_DB_PRINTF("%p", c_arg);
	C_DB_PRINTF("%p", c_func);
	C_DB_PRINTF("%p", c_lock);
	C_DB_PRINTF("%#x", c_flags);
	C_DB_PRINTF("%#x", c_iflags);
	C_DB_PRINTF("%d", c_cpu);
#undef C_DB_PRINTF
}

DB_SHOW_COMMAND(callout, db_show_callout)
{
	if (!have_addr) {
		db_printf("usage: show callout <struct callout *>\n");
		return;
	}

	_show_callout((struct callout *)addr);
}

static void
_show_last_callout(int cpu, int direct, const char *dirstr)
{
	struct callout_cpu *cc;
	void *func, *arg;

	cc = CC_CPU(cpu);
	func = cc_exec_last_func(cc, direct);
	arg = cc_exec_last_arg(cc, direct);
	db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func);
	db_printsym((db_expr_t)func, DB_STGY_ANY);
	db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg);
}

DB_SHOW_COMMAND_FLAGS(callout_last, db_show_callout_last, DB_CMD_MEMSAFE)
{
	int cpu, last;

	if (have_addr) {
		if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) {
			db_printf("no such cpu: %d\n", (int)addr);
			return;
		}
		cpu = last = addr;
	} else {
		cpu = 0;
		last = mp_maxid;
	}

	while (cpu <= last) {
		if (!CPU_ABSENT(cpu)) {
			_show_last_callout(cpu, 0, "");
			_show_last_callout(cpu, 1, " direct");
		}
		cpu++;
	}
}
#endif /* DDB */
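For reference, below is a minimal consumer sketch of the callout(9) interface implemented in this file. The foo_softc structure, foo_tick() handler, and foo_mtx lock are hypothetical names used only to illustrate the usual init/reset/drain lifecycle; they are not part of the source above.

/* Hypothetical consumer sketch; not part of kern_timeout.c. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

struct foo_softc {
	struct mtx	foo_mtx;	/* protects foo_ticks */
	struct callout	foo_callout;
	int		foo_ticks;
};

/* The handler runs with foo_mtx held because of callout_init_mtx() below. */
static void
foo_tick(void *arg)
{
	struct foo_softc *sc = arg;

	mtx_assert(&sc->foo_mtx, MA_OWNED);
	sc->foo_ticks++;

	/* Re-arm one second (hz ticks) from now. */
	callout_reset(&sc->foo_callout, hz, foo_tick, sc);
}

static void
foo_start(struct foo_softc *sc)
{
	mtx_init(&sc->foo_mtx, "foo", NULL, MTX_DEF);
	callout_init_mtx(&sc->foo_callout, &sc->foo_mtx, 0);

	mtx_lock(&sc->foo_mtx);
	callout_reset(&sc->foo_callout, hz, foo_tick, sc);
	mtx_unlock(&sc->foo_mtx);
}

static void
foo_stop(struct foo_softc *sc)
{
	/* Cancel a pending tick and wait for a running foo_tick() to return. */
	callout_drain(&sc->foo_callout);
	mtx_destroy(&sc->foo_mtx);
}

Note that foo_stop() calls callout_drain() without foo_mtx held: the handler needs that lock to finish, so draining while holding it could deadlock.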