diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 22dd49b65ede..1b05640df93f 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -1,673 +1,662 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) int hogticks; int lbolt; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. 
*/ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void endtsleep(void *); static void loadav(void *arg); static void lboltcb(void *arg); /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. */ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; hogticks = (hz / 10) * 2; /* Default only. */ for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct thread *td = curthread; struct proc *p = td->td_proc; int sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif /* XXX: mtx == NULL ?? */ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mtx->mtx_object, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); /* * If we are capable of async syscalls and there isn't already * another one ready to return, start a new thread * and queue it as ready to run. Note that there is danger here * because we need to make sure that we don't sleep allocating * the thread (recursion here might be bad). */ mtx_lock_spin(&sched_lock); if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * Just don't bother if we are exiting * and not the exiting thread or thread was marked as * interrupted. */ if (catch) { if ((p->p_flag & P_WEXIT) && p->p_singlethread != td) { mtx_unlock_spin(&sched_lock); return (EINTR); } if (td->td_flags & TDF_INTERRUPT) { mtx_unlock_spin(&sched_lock); return (td->td_intrval); } } } if (cold ) { /* * During autoconfiguration, just return; * don't run any other procs or panic below, * in case this is the idle process and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. 
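The TABLESIZE/LOOKUP() pair above hashes a wait channel (an arbitrary kernel address) into one of 128 sleep-queue buckets: the low 8 bits are mostly determined by alignment, so they are shifted away before masking with TABLESIZE - 1. A minimal userland sketch of the same hash; the variables standing in for wait channels are invented and nothing here is kernel API:

#include <stdint.h>
#include <stdio.h>

#define TABLESIZE 128	/* must remain a power of two for the mask to work */
#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))

int
main(void)
{
	/*
	 * Stand-ins for the kernel objects whose addresses would be
	 * passed to msleep() and wakeup() as wait channels.
	 */
	static int chan_a, chan_b;
	static char buf[4096];

	printf("&chan_a   %p -> slpque[%ld]\n", (void *)&chan_a,
	    (long)LOOKUP(&chan_a));
	printf("&chan_b   %p -> slpque[%ld]\n", (void *)&chan_b,
	    (long)LOOKUP(&chan_b));
	printf("&buf[0]   %p -> slpque[%ld]\n", (void *)buf,
	    (long)LOOKUP(buf));
	printf("&buf[256] %p -> slpque[%ld]\n", (void *)&buf[256],
	    (long)LOOKUP(&buf[256]));
	return (0);
}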
*/ if (mtx != NULL && priority & PDROP) mtx_unlock(mtx); mtx_unlock_spin(&sched_lock); return (0); } DROP_GIANT(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock(mtx); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", td, p->p_pid, p->p_comm, wmesg, ident); td->td_wchan = ident; td->td_wmesg = wmesg; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); TD_SET_ON_SLEEPQ(td); if (timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, and * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ if (catch) { CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); sig = cursig(td); mtx_unlock(&p->p_sigacts->ps_mtx); if (sig == 0 && thread_suspend_check(1)) sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (TD_ON_SLEEPQ(td)) unsleep(td); } else if (!TD_ON_SLEEPQ(td)) catch = 0; } else sig = 0; /* * Let the scheduler know we're about to voluntarily go to sleep. */ sched_sleep(td, priority & PRIMASK); if (TD_ON_SLEEPQ(td)) { p->p_stats->p_ru.ru_nvcsw++; TD_SET_SLEEPING(td); mi_switch(); } /* * We're awake from voluntary sleep. */ CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; if (sig == 0) rval = EWOULDBLOCK; } else if (td->td_flags & TDF_TIMOFAIL) { td->td_flags &= ~TDF_TIMOFAIL; } else if (timo && callout_stop(&td->td_slpcallout) == 0) { /* * This isn't supposed to be pretty. If we are here, then * the endtsleep() callout is currently executing on another * CPU and is either spinning on the sched_lock or will be * soon. If we don't synchronize here, there is a chance * that this process may msleep() again before the callout * has a chance to run and the callout may end up waking up * the wrong msleep(). Yuck. */ TD_SET_SLEEPING(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); td->td_flags &= ~TDF_TIMOFAIL; } if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) && (rval == 0)) { rval = td->td_intrval; } mtx_unlock_spin(&sched_lock); if (rval == 0 && catch) { PROC_LOCK(p); /* XXX: shouldn't we always be calling cursig()? */ mtx_lock(&p->p_sigacts->ps_mtx); if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); } return (rval); } /* * Implement timeout for msleep(). * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. 
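The return-value contract described above (0 on wakeup, EWOULDBLOCK on timeout, EINTR or ERESTART when PCATCH is set and a signal arrives) is what callers loop on. A hedged sketch of a typical consumer/producer pair; struct softc, its fields, the "datwt" wmesg and the 5 second timeout are invented for illustration, only the msleep()/wakeup() usage follows the interface above:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct softc {
	struct mtx	sc_mtx;		/* protects sc_ready */
	int		sc_ready;
};

/* Sleep until another thread sets sc_ready and calls wakeup(). */
static int
wait_for_data(struct softc *sc)
{
	int error;

	mtx_lock(&sc->sc_mtx);
	while (sc->sc_ready == 0) {
		/* Interruptible sleep, at most 5 seconds per attempt. */
		error = msleep(&sc->sc_ready, &sc->sc_mtx, PZERO | PCATCH,
		    "datwt", 5 * hz);
		if (error != 0) {
			/* EWOULDBLOCK, EINTR or ERESTART. */
			mtx_unlock(&sc->sc_mtx);
			return (error);
		}
	}
	sc->sc_ready = 0;
	mtx_unlock(&sc->sc_mtx);
	return (0);
}

/* Producer side: publish the data and wake any sleepers. */
static void
data_arrived(struct softc *sc)
{

	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	wakeup(&sc->sc_ready);
	mtx_unlock(&sc->sc_mtx);
}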
*/ static void endtsleep(arg) void *arg; { register struct thread *td; td = (struct thread *)arg; CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); /* * This is the other half of the synchronization with msleep() * described above. If the TDS_TIMEOUT flag is set, we lost the * race and just need to put the process back on the runqueue. */ if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_flags |= TDF_TIMEOUT; td->td_wmesg = NULL; } else td->td_flags |= TDF_TIMOFAIL; TD_CLR_SLEEPING(td); setrunnable(td); mtx_unlock_spin(&sched_lock); } /* * Abort a thread, as if an interrupt had occured. Only abort * interruptable waits (unfortunatly it isn't only safe to abort others). * This is about identical to cv_abort(). * Think about merging them? * Also, whatever the signal code does... */ void abortsleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { if (TD_ON_SLEEPQ(td)) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); } } } /* * Remove a process from its wait queue */ void unsleep(struct thread *td) { mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_wmesg = NULL; } mtx_unlock_spin(&sched_lock); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; struct thread *ntd; struct proc *p; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); goto restart; } } mtx_unlock_spin(&sched_lock); } /* * Make a process sleeping on the specified identifier runnable. * May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct proc *p; register struct slpquehead *qp; register struct thread *td; struct thread *ntd; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); break; } } mtx_unlock_spin(&sched_lock); } /* * The machine independent parts of mi_switch(). */ void mi_switch(void) { struct bintime new_switchtime; struct thread *td; - struct thread *newtd; struct proc *p; - u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1, ("mi_switch: switch in a critical section")); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. 
*/ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); td->td_generation++; /* bump preempt-detect counter */ #ifdef DDB /* * Don't perform context switches from the debugger. */ if (db_active) { mtx_unlock_spin(&sched_lock); db_print_backtrace(); db_error("Context switches not allowed in the debugger"); } #endif /* * Check if the process exceeds its cpu resource allocation. If * over max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_runtime.sec > p->p_cpulimit) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); - sched_nest = sched_lock.mtx_recurse; if (td->td_proc->p_flag & P_SA) thread_switchout(td); - sched_switchout(td); - - newtd = choosethread(); - if (td != newtd) - cpu_switch(td, newtd); /* SHAZAM!! */ - - sched_lock.mtx_recurse = sched_nest; - sched_lock.mtx_lock = (uintptr_t)td; - sched_switchin(td); + sched_switch(td); /* * Start setting up stats etc. for the incoming thread. * Similar code in fork_exit() is returned to by cpu_switch() * in the case of a new thread/process. */ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. */ void setrunnable(struct thread *td) { struct proc *p; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; struct proc *p; struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { switch (td->td_state) { case TDS_RUNQ: case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; nrun++; /* XXXKSE */ default: break; } nextproc: continue; } } sx_sunlock(&allproc_lock); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. 
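The update in loadav() above is a fixed-point exponentially weighted moving average: every 5-second sample mixes the current run-queue length into the old average with weight (FSCALE - cexp[i]) / FSCALE. A small userland sketch of the same arithmetic; FSHIFT is assumed to be 11 as in the kernel's param.h, and the constant run-queue length is made up (build with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t fixpt_t;

#define FSHIFT	11			/* assumed to match <sys/param.h> */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	/* Same values as cexp[] above: exp(-5/60), exp(-5/300), exp(-5/900). */
	const fixpt_t cexp[3] = {
		(fixpt_t)(exp(-1.0 / 12.0) * FSCALE),
		(fixpt_t)(exp(-1.0 / 60.0) * FSCALE),
		(fixpt_t)(exp(-1.0 / 180.0) * FSCALE),
	};
	const int minutes[3] = { 1, 5, 15 };
	fixpt_t ldavg[3] = { 0, 0, 0 };
	int nrun = 2;			/* pretend two runnable threads */
	int i, sample;

	/* One pass per 5-second sample; simulate 10 minutes of steady load. */
	for (sample = 0; sample < 120; sample++)
		for (i = 0; i < 3; i++)
			ldavg[i] = (cexp[i] * ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	for (i = 0; i < 3; i++)
		printf("%2d-minute average: %.2f\n", minutes[i],
		    (double)ldavg[i] / FSCALE);
	return (0);
}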
*/ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&loadav_callout, 0); callout_init(&lbolt_callout, CALLOUT_MPSAFE); /* Kick off timeout driven events by calling first time. */ loadav(NULL); lboltcb(NULL); } /* * General purpose yield system call */ int yield(struct thread *td, struct yield_args *uap) { struct ksegrp *kg; kg = td->td_ksegrp; mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; return (0); } diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 7f11c4bca066..542dce4dcd14 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -1,726 +1,727 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). 
*/ #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ }; static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout schedcpu_callout; static struct callout roundrobin_callout; static void roundrobin(void *arg); static void schedcpu(void *arg); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (kg_estcpu) usage in 5 * loadav time * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * kg_estcpu *= decay; * will compute * kg_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . 
* therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void *arg) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int awake, realstathz; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Prevent state changes and protect run queue. */ mtx_lock_spin(&sched_lock); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. */ p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. */ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * ke_pctcpu is only for ps and ttyinfo(). * Do it per kse, and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? 
((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. */ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else kg->kg_slptime++; if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at * least six times the loadfactor will decay kg_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register fixpt_t loadfac; register unsigned int newcpu; loadfac = loadfactor(averunnable.ldavg[0]); if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { newcpu = kg->kg_estcpu; kg->kg_slptime--; /* was incremented in schedcpu() */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&schedcpu_callout, CALLOUT_MPSAFE); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* External interfaces start here */ int sched_runnable(void) { return runq_check(&runq); } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (kg_estcpu) is increased here. resetpriority() will * compute a different priority each time kg_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. 
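The derivation above claims that decay = (2 * loadav) / (2 * loadav + 1) makes kg_estcpu lose 90% of its value after roughly 5 * loadav applications of decay_cpu(), i.e. 5 * loadav seconds worth of schedcpu() runs. A quick floating-point check of that claim; illustrative only, not kernel code (build with -lm):

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double decay, est;
	int loadav, steps;

	for (loadav = 1; loadav <= 4; loadav++) {
		decay = (2.0 * loadav) / (2.0 * loadav + 1.0);

		/* decay ** (5 * loadav) should land close to 0.1 ... */
		printf("loadav %d: decay = %.4f, decay^(5*loadav) = %.4f, ",
		    loadav, decay, pow(decay, 5.0 * loadav));

		/*
		 * ... and counting whole schedcpu() rounds until only 10%
		 * is left gives the ceiling of the "power" values listed
		 * above (5.68, 10.32, 14.94, 19.55).
		 */
		est = 1.0;
		for (steps = 0; est > 0.1; steps++)
			est *= decay;
		printf("90%% forgotten after %d rounds\n", steps);
	}
	return (0);
}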
This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct thread *td) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; ke = td->td_kse; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * charge childs scheduling cpu usage to parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait since process estcpu is sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_exit_kse(struct kse *ke, struct kse *child) { } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_nice(struct ksegrp *kg, int nice) { PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); kg->kg_nice = nice; resetpriority(kg); } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. */ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_priority = prio; } void -sched_switchin(struct thread *td) -{ - - mtx_assert(&sched_lock, MA_OWNED); - td->td_oncpu = PCPU_GET(cpuid); -} - -void -sched_switchout(struct thread *td) +sched_switch(struct thread *td) { + struct thread *newtd; + u_long sched_nest; struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?")); td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. 
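To make the estcpu-to-priority relationship above concrete, resetpriority() computes PUSER + kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg_nice - PRIO_MIN) and clamps the result to the timeshare range. A userland restatement; the numeric values of PUSER and the timeshare bounds are assumptions standing in for <sys/priority.h>:

#include <stdio.h>

/* Assumed stand-ins for the kernel's <sys/priority.h> / <sys/resource.h>. */
#define PUSER			160
#define PRI_MIN_TIMESHARE	160
#define PRI_MAX_TIMESHARE	223
#define PRIO_MIN		(-20)
#define INVERSE_ESTCPU_WEIGHT	8	/* priorities per estcpu level */
#define NICE_WEIGHT		1	/* priorities per nice level */

static unsigned int
resetpriority_demo(unsigned int estcpu, int nice)
{
	unsigned int newpriority;

	newpriority = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (nice - PRIO_MIN);
	if (newpriority < PRI_MIN_TIMESHARE)
		newpriority = PRI_MIN_TIMESHARE;
	if (newpriority > PRI_MAX_TIMESHARE)
		newpriority = PRI_MAX_TIMESHARE;
	return (newpriority);
}

int
main(void)
{
	unsigned int estcpu;

	/* A CPU hog's priority number climbs (gets worse) as estcpu grows. */
	for (estcpu = 0; estcpu <= 256; estcpu += 64)
		printf("estcpu %3u, nice   0 -> user priority %u\n",
		    estcpu, resetpriority_demo(estcpu, 0));
	printf("estcpu   0, nice -20 -> user priority %u\n",
	    resetpriority_demo(0, -20));
	printf("estcpu   0, nice +20 -> user priority %u\n",
	    resetpriority_demo(0, 20));
	return (0);
}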
So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. */ kse_reassign(ke); } + sched_nest = sched_lock.mtx_recurse; + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; + td->td_oncpu = PCPU_GET(cpuid); } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct thread *td) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(&runq, ke); } void sched_rem(struct thread *td) { struct kse *ke; ke = td->td_kse; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); runq_remove(&runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; ke = runq_choose(&runq); if (ke != NULL) { runq_remove(&runq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("runq_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct thread *td) { return (td->td_kse->ke_pctcpu); } diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 71646b6a22de..1045122bcd67 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,1358 +1,1359 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); static int sched_strict; SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 10; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; #ifdef SMP /* Callout to handle load balancing SMP systems. */ static struct callout kseq_lb_callout; #endif /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. */ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu struct kg_sched { int skg_slptime; /* Number of ticks we vol. slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * The priority is primarily determined by the interactivity score. Thus, we * give lower(better) priorities to kse groups that use less CPU. The nice * value is then directly added to this to allow nice to have some effect * on latency. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Number of nice values. * PRI_BASE: The start of the dynamic range. 
*/ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV PRIO_TOTAL #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshhold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << 10) #define SCHED_SLP_RUN_THROTTLE (100) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) /* * This macro determines whether or not the kse belongs on the current or * next run queue. * * XXX nice value should effect how interactive a kg is. */ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority != kg->kg_user_pri || \ SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ int ksq_load; /* Aggregate load. */ short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP int ksq_cpus; /* Count of CPUs in this kseq. */ unsigned int ksq_rslices; /* Slices on run queue */ #endif }; /* * One kse queue per processor. 
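To see how the macros above turn an interactivity score and a nice value into a user priority, here is a userland restatement; sched_priority() later in this file performs the same computation and clamping. The timeshare bounds are assumed values standing in for <sys/priority.h>:

#include <stdio.h>

#define PRI_MIN_TIMESHARE	160	/* assumed for illustration */
#define PRI_MAX_TIMESHARE	223	/* assumed for illustration */
#define SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define SCHED_INTERACT_MAX	100
#define SCHED_PRI_INTERACT(score)					\
	((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/* Mirror of sched_priority(): score and nice offset the timeshare base. */
static int
ule_user_pri(int score, int nice)
{
	int pri;

	pri = SCHED_PRI_INTERACT(score) + SCHED_PRI_BASE + nice;
	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;
	return (pri);
}

int
main(void)
{
	int score;

	/* Lower (better) scores and negative nice give better priorities. */
	for (score = 0; score <= 100; score += 20)
		printf("score %3d: nice -20 -> %d, nice 0 -> %d, "
		    "nice +20 -> %d\n", score, ule_user_pri(score, -20),
		    ule_user_pri(score, 0), ule_user_pri(score, 20));
	return (0);
}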
*/ #ifdef SMP struct kseq kseq_cpu[MAXCPU]; struct kseq *kseq_idmap[MAXCPU]; #define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (kseq_idmap[(x)]) #else struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); static void sched_interact_update(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); int sched_pickcpu(void); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq, int steal); static void kseq_setup(struct kseq *kseq); static void kseq_add(struct kseq *kseq, struct kse *ke); static void kseq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP struct kseq * kseq_load_highest(void); void kseq_balance(void *arg); void kseq_move(struct kseq *from, int cpu); #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < PRIO_TOTAL + 1; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static void kseq_add(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; kseq->ksq_load++; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices += ke->ke_slice; #endif } static void kseq_rem(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices -= ke->ke_slice; #endif } static void kseq_nice_add(struct kseq *kseq, int nice) { mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_loads[PRI_TIMESHARE] == 0) return; for (; n < SCHED_PRI_NRESV + 1; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * kseq_balance is a simple CPU load balancing algorithm. 
It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ void kseq_balance(void *arg) { struct kseq *kseq; int high_load; int low_load; int high_cpu; int low_cpu; int move; int diff; int i; high_cpu = 0; low_cpu = 0; high_load = 0; low_load = -1; mtx_lock_spin(&sched_lock); if (smp_started == 0) goto out; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > high_load) { high_load = kseq->ksq_load; high_cpu = i; } if (low_load == -1 || kseq->ksq_load < low_load) { low_load = kseq->ksq_load; low_cpu = i; } } kseq = KSEQ_CPU(high_cpu); /* * Nothing to do. */ if (high_load < kseq->ksq_cpus + 1) goto out; high_load -= kseq->ksq_cpus; if (low_load >= high_load) goto out; diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; for (i = 0; i < move; i++) kseq_move(kseq, low_cpu); out: mtx_unlock_spin(&sched_lock); callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); return; } struct kseq * kseq_load_highest(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); cpu = 0; load = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > load) { load = kseq->ksq_load; cpu = i; } } kseq = KSEQ_CPU(cpu); if (load > kseq->ksq_cpus) return (kseq); return (NULL); } void kseq_move(struct kseq *from, int cpu) { struct kse *ke; ke = kseq_choose(from, 1); runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; kseq_rem(from, ke); ke->ke_cpu = cpu; sched_add(ke); } #endif /* * Pick the highest priority task we have and return it. If steal is 1 we * will return kses that have been denied slices due to their nice being too * low. In the future we should prohibit stealing interrupt threads as well. */ struct kse * kseq_choose(struct kseq *kseq, int steal) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. 
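kseq_balance() above boils down to a small piece of arithmetic once the busiest and least busy kseqs are found: discount the busiest queue by one running thread per CPU, and if it is still ahead, migrate half of the difference, rounded up. A toy restatement with invented load figures:

#include <stdio.h>

int
main(void)
{
	/* Invented sample values for the two kseqs picked by the balancer. */
	int high_load = 7;	/* load of the busiest kseq */
	int high_cpus = 1;	/* CPUs sharing that kseq (ksq_cpus) */
	int low_load = 2;	/* load of the least busy kseq */
	int diff, move;

	high_load -= high_cpus;
	if (low_load >= high_load) {
		printf("already balanced, nothing to move\n");
		return (0);
	}
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	printf("migrating %d kse(s) from the busy cpu to the idle one\n",
	    move);
	return (0);
}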
*/ if (ke->ke_slice == 0 && steal == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_loads[PRI_ITHD] = 0; kseq->ksq_loads[PRI_REALTIME] = 0; kseq->ksq_loads[PRI_TIMESHARE] = 0; kseq->ksq_loads[PRI_IDLE] = 0; kseq->ksq_load = 0; #ifdef SMP kseq->ksq_rslices = 0; #endif } static void sched_setup(void *dummy) { #ifdef SMP int i; #endif slice_min = (hz/100); /* 10ms */ slice_max = (hz/7); /* ~140ms */ #ifdef SMP /* init kseqs */ /* Create the idmap. */ #ifdef ULE_HTT_EXPERIMENTAL if (smp_topology == NULL) { #else if (1) { #endif for (i = 0; i < MAXCPU; i++) { kseq_setup(&kseq_cpu[i]); kseq_idmap[i] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = 1; } } else { int j; for (i = 0; i < smp_topology->ct_count; i++) { struct cpu_group *cg; cg = &smp_topology->ct_group[i]; kseq_setup(&kseq_cpu[i]); for (j = 0; j < MAXCPU; j++) if ((cg->cg_mask & (1 << j)) != 0) kseq_idmap[j] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = cg->cg_count; } } callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); kseq_balance(NULL); #else kseq_setup(KSEQ_SELF()); #endif mtx_lock_spin(&sched_lock); kseq_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. * * If you are outside of the window you will get no slice and * you will be reevaluated each time you are selected on the * run queue. * */ if (!SCHED_INTERACTIVE(kg)) { int nice; nice = kg->kg_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || kg->kg_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_PRI_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_MIN; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); /* * Check to see if we need to scale back the slp and run time * in the kg. 
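The slice policy spelled out above can be restated numerically: interactive ksegrps always get SCHED_SLICE_MIN, the ksegrp that is least nice on its run queue gets SCHED_SLICE_MAX, and everyone else shrinks toward a zero slice as their nice value moves away from ksq_nicemin. A userland sketch, assuming hz = 1000 so that slice_min = 10 and slice_max = 142 as set in sched_setup() above:

#include <stdio.h>

#define PRIO_TOTAL		41	/* assumed: nice -20 .. +20 */
#define SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)

static const int slice_min = 10;	/* hz/100, assuming hz = 1000 */
static const int slice_max = 142;	/* hz/7,   assuming hz = 1000 */

#define SCHED_SLICE_RANGE	(slice_max - slice_min + 1)
#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
#define SCHED_SLICE_NICE(nice)						\
	(slice_max - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

int
main(void)
{
	int nice;

	/*
	 * "nice" is the distance from the least nice ksegrp on the run
	 * queue (kg_nice - ksq_nicemin), as used by sched_slice() above.
	 * Offsets beyond SCHED_PRI_NTHRESH receive no slice at all and
	 * are re-evaluated whenever they are selected.
	 */
	for (nice = 0; nice <= SCHED_PRI_NTHRESH; nice += 4)
		printf("nice offset %2d -> slice of %3d ticks\n",
		    nice, SCHED_SLICE_NICE(nice));
	return (0);
}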
This will cause us to forget old interactivity * while maintaining the current ratio. */ sched_interact_update(kg); return; } static void sched_interact_update(struct ksegrp *kg) { /* XXX Fixme, use a linear algorithm and not a while loop. */ while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { kg->kg_runtime = (kg->kg_runtime / 5) * 4; kg->kg_slptime = (kg->kg_slptime / 5) * 4; } } static int sched_interact_score(struct ksegrp *kg) { int div; if (kg->kg_runtime > kg->kg_slptime) { div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); } if (kg->kg_slptime > kg->kg_runtime) { div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); return (kg->kg_runtime / div); } /* * This can happen if slptime and runtime are 0. */ return (0); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. */ if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { /* * Shift the tick count out so that the divide doesn't * round away our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; } else ke->ke_ticks = 0; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP /* XXX Should be changed to kseq_load_lowest() */ int sched_pickcpu(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); if (!smp_started) return (0); load = 0; cpu = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load < load) { cpu = i; load = kseq->ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void -sched_switchout(struct thread *td) +sched_switch(struct thread *td) { + struct thread *newtd; + u_int sched_nest; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { if (td->td_proc->p_flag & P_SA) { kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); setrunqueue(td); } else { /* * This queue is always correct except for idle threads which * have a higher priority due to priority propagation. */ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE && ke->ke_thread->td_priority > PRI_MIN_IDLE) ke->ke_runq = KSEQ_SELF()->ksq_curr; runq_add(ke->ke_runq, ke); /* setrunqueue(td); */ } return; } if (ke->ke_runq) kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); /* * We will not be on the run queue. So we must be * sleeping or similar. 
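sched_interact_score() above reduces the balance of voluntary sleep time versus run time to a 0..100 score, where anything below SCHED_INTERACT_THRESH counts as interactive and earns placement on the current queue; sched_interact_update() keeps the two counters bounded so old behaviour fades. A userland restatement of the scoring, fed with made-up tick counts:

#include <stdio.h>

#define SCHED_INTERACT_MAX	100
#define SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH	30

/* Same shape as sched_interact_score(), on plain ints. */
static int
interact_score(int slptime, int runtime)
{
	int div;

	if (runtime > slptime) {
		div = runtime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - slptime / div));
	}
	if (slptime > runtime) {
		div = slptime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (runtime / div);
	}
	return (0);		/* equal (or both zero) histories */
}

int
main(void)
{
	/* Invented (slptime, runtime) pairs, in ticks << 10 as above. */
	const int samples[3][2] = {
		{ 900 << 10, 100 << 10 },	/* mostly sleeping */
		{ 400 << 10, 600 << 10 },	/* runs more than it sleeps */
		{ 100 << 10, 900 << 10 },	/* almost always running */
	};
	int i, score;

	for (i = 0; i < 3; i++) {
		score = interact_score(samples[i][0], samples[i][1]);
		printf("slp %3d run %3d -> score %2d (%sinteractive)\n",
		    samples[i][0] >> 10, samples[i][1] >> 10, score,
		    score < SCHED_INTERACT_THRESH ? "" : "not ");
	}
	return (0);
}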
*/ if (td->td_proc->p_flag & P_SA) kse_reassign(ke); -} - -void -sched_switchin(struct thread *td) -{ - /* struct kse *ke = td->td_kse; */ - mtx_assert(&sched_lock, MA_OWNED); + sched_nest = sched_lock.mtx_recurse; + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void sched_nice(struct ksegrp *kg, int nice) { struct kse *ke; struct thread *td; struct kseq *kseq; PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. */ if (kg->kg_pri_class == PRI_TIMESHARE) FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_runq == NULL) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq_nice_rem(kseq, kg->kg_nice); kseq_nice_add(kseq, nice); } kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = ticks - td->td_slptime; kg->kg_slptime += hzticks << 10; sched_interact_update(kg); sched_priority(kg); if (td->td_kse) sched_slice(td->td_kse); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ child->ke_runq = NULL; /* Grab our parents cpu estimation information. 
*/ child->ke_ticks = ke->ke_ticks; child->ke_ltick = ke->ke_ltick; child->ke_ftick = ke->ke_ftick; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); /* XXX Need something better here */ child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; kg->kg_runtime += tickincr << 10; sched_interact_update(kg); child->kg_user_pri = kg->kg_user_pri; child->kg_nice = kg->kg_nice; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; kseq->ksq_loads[PRI_BASE(class)]++; if (kg->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, kg->kg_nice); else if (class == PRI_TIMESHARE) kseq_nice_add(kseq, kg->kg_nice); } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { /* kg->kg_slptime += child->kg_slptime; */ kg->kg_runtime += child->kg_runtime; sched_interact_update(kg); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; #if 0 struct kse *nke; #endif /* * sched_setup() apparently happens prior to stathz being set. We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. */ if (tickincr == 0) tickincr = 1; } ke = td->td_kse; kg = ke->ke_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * Check for a higher priority task on the run queue. This can happen * on SMP if another processor woke up a process on our runq. */ kseq = KSEQ_SELF(); #if 0 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) { if (sched_strict && nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; else if (nke->ke_thread->td_priority < td->td_priority SCHED_PRIO_SLOP) if (nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; } #endif /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. 
*/ kg->kg_runtime += tickincr << 10; sched_interact_update(kg); /* * We used up one time slice. */ ke->ke_slice--; #ifdef SMP kseq->ksq_rslices--; #endif if (ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; mtx_lock_spin(&sched_lock); kseq = KSEQ_SELF(); if (kseq->ksq_load) goto out; #ifdef SMP /* * For SMP we may steal other processor's KSEs. Just search until we * verify that at least on other cpu has a runnable task. */ if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > kseq->ksq_cpus) goto out; } } #endif load = 0; out: mtx_unlock_spin(&sched_lock); return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; #if 0 struct kseq *kseq; struct kse *ke; #endif kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; /* * This optimization is temporarily disabled because it * breaks priority propagation. */ #if 0 kseq = KSEQ_SELF(); if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && #ifdef SMP kseq->ksq_load > kseq->ksq_cpus && #else kseq->ksq_load > 1 && #endif (ke = kseq_choose(kseq, 0)) != NULL && ke->ke_thread->td_priority < td->td_priority) #endif curthread->td_flags |= TDF_NEEDRESCHED; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP retry: #endif kseq = KSEQ_SELF(); ke = kseq_choose(kseq, 0); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (smp_started) { /* * Find the cpu with the highest load and steal one proc. */ if ((kseq = kseq_load_highest()) == NULL) return (NULL); /* * Remove this kse from this kseq and runq and then requeue * on the current processor. Then we will dequeue it * normally above. */ kseq_move(kseq, PCPU_GET(cpuid)); goto retry; } #endif return (NULL); } void sched_add(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; ke = td->td_kse; kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); switch (PRI_BASE(kg->kg_pri_class)) { case PRI_ITHD: case PRI_REALTIME: kseq = KSEQ_SELF(); ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: kseq = KSEQ_CPU(ke->ke_cpu); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: kseq = KSEQ_CPU(ke->ke_cpu); /* * This is for priority prop. 
*/ if (ke->ke_thread->td_priority > PRI_MIN_IDLE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class.\n"); break; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); kseq_add(kseq, ke); } void sched_rem(struct thread *td) { struct kseq *kseq; struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); runq_remove(ke->ke_runq, ke); kseq_rem(kseq, ke); } fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct kse *ke; pctcpu = 0; ke = td->td_kse; mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* * Don't update more frequently than twice a second. Allowing * this causes the cpu usage to decay away too quickly due to * rounding errors. */ if (ke->ke_ltick < (ticks - (hz / 2))) sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } diff --git a/sys/sys/sched.h b/sys/sys/sched.h index 7fc9df23cd99..cd804cc559a5 100644 --- a/sys/sys/sched.h +++ b/sys/sys/sched.h @@ -1,95 +1,94 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SCHED_H_ #define _SYS_SCHED_H_ /* * General scheduling info. */ int sched_rr_interval(void); int sched_runnable(void); /* * Proc related scheduling hooks. */ void sched_exit(struct proc *p, struct proc *child); void sched_fork(struct proc *p, struct proc *child); /* * KSE Groups contain scheduling priority information. They record the * behavior of groups of KSEs and threads. 
*/ void sched_class(struct ksegrp *kg, int class); void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child); void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child); void sched_nice(struct ksegrp *kg, int nice); /* * Threads are switched in and out, block on resources, and have temporary * priorities inherited from their ksegs. */ void sched_exit_thread(struct thread *td, struct thread *child); void sched_fork_thread(struct thread *td, struct thread *child); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td, u_char prio); -void sched_switchin(struct thread *td); -void sched_switchout(struct thread *td); +void sched_switch(struct thread *td); void sched_userret(struct thread *td); void sched_wakeup(struct thread *td); /* * KSEs are moved on and off of run queues. */ void sched_add(struct thread *td); struct kse *sched_choose(void); void sched_clock(struct thread *td); void sched_exit_kse(struct kse *ke, struct kse *child); void sched_fork_kse(struct kse *ke, struct kse *child); void sched_rem(struct thread *td); /* * and they use up cpu time. */ fixpt_t sched_pctcpu(struct thread *td); /* * These procedures tell the process data structure allocation code how * many bytes to actually allocate. */ int sched_sizeof_kse(void); int sched_sizeof_ksegrp(void); int sched_sizeof_proc(void); int sched_sizeof_thread(void); extern struct ke_sched *kse0_sched; extern struct kg_sched *ksegrp0_sched; extern struct p_sched *proc0_sched; extern struct td_sched *thread0_sched; #endif /* !_SYS_SCHED_H_ */
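The hunks above fold sched_switchout()/sched_switchin() into a single sched_switch() and carry the rest of ULE's bookkeeping along with them. Two pieces of that bookkeeping are worth restating outside the diff: the sleep/run interactivity accounting (sched_interact_update() and sched_interact_score(), which sched_wakeup(), sched_fork_ksegrp() and sched_clock() feed) and the windowed %cpu estimate (sched_pctcpu_update() and sched_pctcpu()). The userland sketch below is illustrative only, not kernel code: struct kg_sketch, interact_update(), interact_score(), SLP_RUN_MAX and INTERACT_HALF are hypothetical stand-ins for struct ksegrp, sched_interact_update(), sched_interact_score(), SCHED_SLP_RUN_MAX and SCHED_INTERACT_HALF, whose real values are defined elsewhere in sched_ule.c and are not shown in this patch.

/*
 * Standalone sketch of ULE's interactivity bookkeeping.  The constants and
 * the struct are illustrative stand-ins; the kernel operates on struct
 * ksegrp and on tick counts shifted left by 10 for fixed-point precision.
 */
#include <stdio.h>

#define SLP_RUN_MAX	(5 * 1024 * 100)	/* hypothetical cap on slp + run */
#define INTERACT_HALF	50			/* half of a 0..100 score range */

struct kg_sketch {
	unsigned long	runtime;	/* cpu time consumed, fixed point */
	unsigned long	slptime;	/* voluntary sleep time, fixed point */
};

/* Scale both counters down by 4/5 until their sum fits under the cap. */
static void
interact_update(struct kg_sketch *kg)
{

	while (kg->runtime + kg->slptime > SLP_RUN_MAX) {
		kg->runtime = kg->runtime / 5 * 4;
		kg->slptime = kg->slptime / 5 * 4;
	}
}

/*
 * Map the sleep/run ratio onto 0..100: mostly-sleeping groups score low
 * (interactive), mostly-running groups score high (cpu hogs).
 */
static int
interact_score(const struct kg_sketch *kg)
{
	unsigned long div;

	if (kg->runtime > kg->slptime) {
		div = kg->runtime / INTERACT_HALF;
		if (div == 0)
			div = 1;
		return (INTERACT_HALF + (INTERACT_HALF - kg->slptime / div));
	}
	if (kg->slptime > kg->runtime) {
		div = kg->slptime / INTERACT_HALF;
		if (div == 0)
			div = 1;
		return (kg->runtime / div);
	}
	/* Both counters zero, or exactly equal. */
	return (0);
}

int
main(void)
{
	struct kg_sketch editor = { .runtime = 10 << 10, .slptime = 900 << 10 };
	struct kg_sketch builder = { .runtime = 900 << 10, .slptime = 10 << 10 };

	interact_update(&editor);
	interact_update(&builder);
	printf("interactive task scores %d, cpu hog scores %d\n",
	    interact_score(&editor), interact_score(&builder));
	return (0);
}

With those made-up numbers a group that mostly sleeps decays to a score of 0 while a pure cpu hog saturates at 100, which is the ordering SCHED_CURR() and sched_priority() key off when placing a kse on ksq_curr versus ksq_next. The second sketch does the same for the %cpu estimate; HZ, STATHZ, CPU_TIME and CPU_TICKS are again hypothetical stand-ins for hz, realstathz, SCHED_CPU_TIME and SCHED_CPU_TICKS, and only the FSCALE/FSHIFT fixed-point convention (FSHIFT is 11, so FSCALE is 2048 in a stock sys/param.h) is taken from the sources.

/*
 * Sketch of the windowed %cpu estimate kept by sched_pctcpu_update() and
 * reported by sched_pctcpu(), with illustrative window and clock values.
 */
#include <stdio.h>

#define HZ		1000
#define STATHZ		128		/* realstathz stand-in */
#define CPU_TIME	10		/* averaging window, in seconds */
#define CPU_TICKS	(HZ * CPU_TIME)
#define FSHIFT		11
#define FSCALE		(1 << FSHIFT)

int
main(void)
{
	long ke_ticks = 640;	/* stat ticks charged over the last window */
	long rtick, pctcpu;

	/* Stat ticks per second over the window, capped at the window size. */
	rtick = ke_ticks / CPU_TIME;
	if (rtick > CPU_TICKS)
		rtick = CPU_TICKS;

	/* Same fixed-point expression as sched_pctcpu(). */
	pctcpu = (FSCALE * ((FSCALE * rtick) / STATHZ)) >> FSHIFT;

	/* 640 ticks over 10s at stathz = 128 is half the cpu: prints 50%. */
	printf("pctcpu = %ld/%d (%.0f%%)\n",
	    pctcpu, FSCALE, 100.0 * pctcpu / FSCALE);
	return (0);
}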