Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 121681) +++ head/sys/kern/kern_fork.c (revision 121682) @@ -1,835 +1,835 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif static int forksleep; /* Place for fork1() to sleep on. */ /* * MPSAFE */ /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* * MPSAFE */ /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* * MPSAFE */ int rfork(td, uap) struct thread *td; struct rfork_args *uap; { int error; struct proc *p2; /* Don't allow kernel only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; sysctl_wire_old_buffer(req, sizeof(int)); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; int flags; int pages; struct proc **procp; { struct proc *p1, *p2, *pptr; uid_t uid; struct proc *newproc; int ok, trypid; static int curfail, pidchecked = 0; static struct timeval lastfail; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct thread *td2; struct kse *ke2; struct ksegrp *kg2; struct sigacts *newsigacts; int error; /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); p1 = td->td_proc; mtx_lock(&Giant); /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { vm_forkproc(td, NULL, NULL, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent.) */ if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); if (p1->p_fd->fd_refcnt > 1) { struct filedesc *newfd; newfd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdfree(td); p1->p_fd = newfd; } else FILEDESC_UNLOCK(p1->p_fd); } mtx_unlock(&Giant); *procp = NULL; return (0); } /* * Note 1:1 allows for forking with one thread coming out on the * other side with the expectation that the process is about to * exec. */ if (p1->p_flag & P_SA) { /* * Idle the other threads for a second. * Since the user space is copied, it must remain stable. * In addition, all threads (from the user perspective) * need to either be suspended or in the kernel, * where they will try restart in the parent and will * be aborted in the child. */ PROC_LOCK(p1); if (thread_single(SINGLE_NO_EXIT)) { /* Abort.. someone else is single threading before us */ PROC_UNLOCK(p1); mtx_unlock(&Giant); return (ERESTART); } PROC_UNLOCK(p1); /* * All other activity in this process * is now suspended at the user boundary, * (or other safe places if we think of any). */ } /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); #ifdef MAC mac_init_proc(newproc); #endif /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); uid = td->td_ucred->cr_ruid; if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); PROC_UNLOCK(p1); if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { PROC_LOCK(p2); while (p2->p_pid == trypid || p2->p_pgrp->pg_id == trypid || p2->p_session->s_sid == trypid) { trypid++; if (trypid >= pidchecked) { PROC_UNLOCK(p2); goto retry; } } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; PROC_UNLOCK(p2); } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(&allproc_lock); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(td->td_proc->p_fd); fdtol = NULL; } else if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_LOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_UNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ td2 = FIRST_THREAD_IN_PROC(p2); kg2 = FIRST_KSEGRP_IN_PROC(p2); ke2 = FIRST_KSE_IN_KSEGRP(kg2); /* Allocate and switch to an alternate kstack if specified */ if (pages != 0) vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) bzero(&p2->p_startzero, (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); bzero(&ke2->ke_startzero, (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); bzero(&td2->td_startzero, (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); bzero(&kg2->kg_startzero, (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&p1->p_startcopy, &p2->p_startcopy, (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); #undef RANGEOF /* Set up the thread as an active thread (as if runnable). */ ke2->ke_state = KES_THREAD; ke2->ke_thread = td2; td2->td_kse = ke2; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats substruct is set in vm_forkproc. */ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; /* * Allow the scheduler to adjust the priority of the child and * parent while we hold the sched_lock. */ sched_fork(p1, p2); mtx_unlock_spin(&sched_lock); p2->p_ucred = crhold(td->td_ucred); td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; /* Bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; if (p2->p_textvp) VREF(p2->p_textvp); p2->p_fd = fd; p2->p_fdtol = fdtol; PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * p_limit is copy-on-write, bump refcnt, */ p2->p_limit = p1->p_limit; p2->p_limit->p_refcnt++; /* * Setup linkage for kernel based threading */ if((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & (P_ALTSTACK | P_SUGID); SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. */ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, flags); if (flags == (RFFDG | RFPROC)) { cnt.v_forks++; cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { cnt.v_vforks++; cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (p1 == &proc0) { cnt.v_kthreads++; cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else { cnt.v_rforks++; cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ microuptime(&p2->p_stats->p_start); if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); p2->p_state = PRS_NORMAL; TD_SET_CAN_RUN(td2); setrunqueue(td2); mtx_unlock_spin(&sched_lock); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * Tell any interested parties about the new process. */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * If other threads are waiting, let them continue now */ if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } /* * Return child proc pointer to parent. */ mtx_unlock(&Giant); *procp = p2; return (0); fail: if (ppsratecheck(&lastfail, &curfail, 1)) printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", uid); sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } tsleep(&forksleep, PUSER, "fork", hz / 2); mtx_unlock(&Giant); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct proc *p; struct thread *td; /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } td = curthread; p = td->td_proc; td->td_oncpu = PCPU_GET(cpuid); p->p_state = PRS_NORMAL; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ sched_lock.mtx_lock = (uintptr_t)td; - sched_lock.mtx_recurse = 0; + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); cpu_critical_fork_exit(); CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); mtx_unlock_spin(&sched_lock); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ PROC_LOCK(p); if (p->p_flag & P_KTHREAD) { PROC_UNLOCK(p); mtx_lock(&Giant); printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kthread_exit(0); } PROC_UNLOCK(p); #ifdef DIAGNOSTIC cred_free_thread(td); #endif mtx_assert(&Giant, MA_NOTOWNED); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/sched_4bsd.c =================================================================== --- head/sys/kern/sched_4bsd.c (revision 121681) +++ head/sys/kern/sched_4bsd.c (revision 121682) @@ -1,733 +1,730 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ }; static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout schedcpu_callout; static struct callout roundrobin_callout; static void roundrobin(void *arg); static void schedcpu(void *arg); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (kg_estcpu) usage in 5 * loadav time * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * kg_estcpu *= decay; * will compute * kg_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void *arg) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int awake, realstathz; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Prevent state changes and protect run queue. */ mtx_lock_spin(&sched_lock); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. */ p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. */ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * ke_pctcpu is only for ps and ttyinfo(). * Do it per kse, and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? ((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. */ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else kg->kg_slptime++; if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at * least six times the loadfactor will decay kg_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register fixpt_t loadfac; register unsigned int newcpu; loadfac = loadfactor(averunnable.ldavg[0]); if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { newcpu = kg->kg_estcpu; kg->kg_slptime--; /* was incremented in schedcpu() */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&schedcpu_callout, CALLOUT_MPSAFE); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* External interfaces start here */ int sched_runnable(void) { return runq_check(&runq); } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (kg_estcpu) is increased here. resetpriority() will * compute a different priority each time kg_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct thread *td) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; ke = td->td_kse; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * charge childs scheduling cpu usage to parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait since process estcpu is sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_exit_kse(struct kse *ke, struct kse *child) { } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_nice(struct ksegrp *kg, int nice) { PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); kg->kg_nice = nice; resetpriority(kg); } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. */ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_priority = prio; } void sched_switch(struct thread *td) { struct thread *newtd; - u_long sched_nest; struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?")); td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. */ kse_reassign(ke); } - sched_nest = sched_lock.mtx_recurse; newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); - sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct thread *td) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(&runq, ke); } void sched_rem(struct thread *td) { struct kse *ke; ke = td->td_kse; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); runq_remove(&runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; ke = runq_choose(&runq); if (ke != NULL) { runq_remove(&runq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("runq_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct thread *td) { struct kse *ke; ke = td->td_kse; if (ke) return (ke->ke_pctcpu); return (0); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 121681) +++ head/sys/kern/sched_ule.c (revision 121682) @@ -1,1347 +1,1344 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); static int sched_strict; SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 10; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; #ifdef SMP /* Callout to handle load balancing SMP systems. */ static struct callout kseq_lb_callout; #endif /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. */ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu struct kg_sched { int skg_slptime; /* Number of ticks we vol. slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * The priority is primarily determined by the interactivity score. Thus, we * give lower(better) priorities to kse groups that use less CPU. The nice * value is then directly added to this to allow nice to have some effect * on latency. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Number of nice values. * PRI_BASE: The start of the dynamic range. */ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV PRIO_TOTAL #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshhold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << 10) #define SCHED_SLP_RUN_THROTTLE (100) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) /* * This macro determines whether or not the kse belongs on the current or * next run queue. * * XXX nice value should effect how interactive a kg is. */ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority != kg->kg_user_pri || \ SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ int ksq_load; /* Aggregate load. */ short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP int ksq_cpus; /* Count of CPUs in this kseq. */ unsigned int ksq_rslices; /* Slices on run queue */ #endif }; /* * One kse queue per processor. */ #ifdef SMP struct kseq kseq_cpu[MAXCPU]; struct kseq *kseq_idmap[MAXCPU]; #define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (kseq_idmap[(x)]) #else struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); static void sched_interact_update(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); int sched_pickcpu(void); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq, int steal); static void kseq_setup(struct kseq *kseq); static void kseq_add(struct kseq *kseq, struct kse *ke); static void kseq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP struct kseq * kseq_load_highest(void); void kseq_balance(void *arg); void kseq_move(struct kseq *from, int cpu); #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < PRIO_TOTAL + 1; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static void kseq_add(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; kseq->ksq_load++; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices += ke->ke_slice; #endif } static void kseq_rem(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices -= ke->ke_slice; #endif } static void kseq_nice_add(struct kseq *kseq, int nice) { mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_loads[PRI_TIMESHARE] == 0) return; for (; n < SCHED_PRI_NRESV + 1; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * kseq_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ void kseq_balance(void *arg) { struct kseq *kseq; int high_load; int low_load; int high_cpu; int low_cpu; int move; int diff; int i; high_cpu = 0; low_cpu = 0; high_load = 0; low_load = -1; mtx_lock_spin(&sched_lock); if (smp_started == 0) goto out; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > high_load) { high_load = kseq->ksq_load; high_cpu = i; } if (low_load == -1 || kseq->ksq_load < low_load) { low_load = kseq->ksq_load; low_cpu = i; } } kseq = KSEQ_CPU(high_cpu); /* * Nothing to do. */ if (high_load < kseq->ksq_cpus + 1) goto out; high_load -= kseq->ksq_cpus; if (low_load >= high_load) goto out; diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; for (i = 0; i < move; i++) kseq_move(kseq, low_cpu); out: mtx_unlock_spin(&sched_lock); callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); return; } struct kseq * kseq_load_highest(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); cpu = 0; load = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > load) { load = kseq->ksq_load; cpu = i; } } kseq = KSEQ_CPU(cpu); if (load > kseq->ksq_cpus) return (kseq); return (NULL); } void kseq_move(struct kseq *from, int cpu) { struct kse *ke; ke = kseq_choose(from, 1); runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; kseq_rem(from, ke); ke->ke_cpu = cpu; sched_add(ke->ke_thread); } #endif /* * Pick the highest priority task we have and return it. If steal is 1 we * will return kses that have been denied slices due to their nice being too * low. In the future we should prohibit stealing interrupt threads as well. */ struct kse * kseq_choose(struct kseq *kseq, int steal) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. */ if (ke->ke_slice == 0 && steal == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_loads[PRI_ITHD] = 0; kseq->ksq_loads[PRI_REALTIME] = 0; kseq->ksq_loads[PRI_TIMESHARE] = 0; kseq->ksq_loads[PRI_IDLE] = 0; kseq->ksq_load = 0; #ifdef SMP kseq->ksq_rslices = 0; #endif } static void sched_setup(void *dummy) { #ifdef SMP int i; #endif slice_min = (hz/100); /* 10ms */ slice_max = (hz/7); /* ~140ms */ #ifdef SMP /* init kseqs */ /* Create the idmap. */ #ifdef ULE_HTT_EXPERIMENTAL if (smp_topology == NULL) { #else if (1) { #endif for (i = 0; i < MAXCPU; i++) { kseq_setup(&kseq_cpu[i]); kseq_idmap[i] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = 1; } } else { int j; for (i = 0; i < smp_topology->ct_count; i++) { struct cpu_group *cg; cg = &smp_topology->ct_group[i]; kseq_setup(&kseq_cpu[i]); for (j = 0; j < MAXCPU; j++) if ((cg->cg_mask & (1 << j)) != 0) kseq_idmap[j] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = cg->cg_count; } } callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); kseq_balance(NULL); #else kseq_setup(KSEQ_SELF()); #endif mtx_lock_spin(&sched_lock); kseq_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. * * If you are outside of the window you will get no slice and * you will be reevaluated each time you are selected on the * run queue. * */ if (!SCHED_INTERACTIVE(kg)) { int nice; nice = kg->kg_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || kg->kg_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_PRI_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_MIN; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); /* * Check to see if we need to scale back the slp and run time * in the kg. This will cause us to forget old interactivity * while maintaining the current ratio. */ sched_interact_update(kg); return; } static void sched_interact_update(struct ksegrp *kg) { int ratio; if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { ratio = ((SCHED_SLP_RUN_MAX * 15) / (kg->kg_runtime + kg->kg_slptime )); kg->kg_runtime = (kg->kg_runtime * ratio) / 16; kg->kg_slptime = (kg->kg_slptime * ratio) / 16; } } static int sched_interact_score(struct ksegrp *kg) { int div; if (kg->kg_runtime > kg->kg_slptime) { div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); } if (kg->kg_slptime > kg->kg_runtime) { div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); return (kg->kg_runtime / div); } /* * This can happen if slptime and runtime are 0. */ return (0); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. */ if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { /* * Shift the tick count out so that the divide doesn't * round away our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; } else ke->ke_ticks = 0; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP /* XXX Should be changed to kseq_load_lowest() */ int sched_pickcpu(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); if (!smp_started) return (0); load = 0; cpu = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load < load) { cpu = i; load = kseq->ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. We still call adjustrunqueue below in case kse * needs to fix things up. */ if (ke && ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && prio < td->td_ksegrp->kg_user_pri) || (td->td_ksegrp->kg_pri_class == PRI_IDLE && prio < PRI_MIN_IDLE))) { runq_remove(ke->ke_runq, ke); ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; runq_add(ke->ke_runq, ke); } adjustrunqueue(td, prio); } else td->td_priority = prio; } void sched_switch(struct thread *td) { struct thread *newtd; - u_int sched_nest; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { if (td->td_proc->p_flag & P_SA) { kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); setrunqueue(td); } else { /* * This queue is always correct except for idle threads * which have a higher priority due to priority * propagation. */ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) { if (td->td_priority < PRI_MIN_IDLE) ke->ke_runq = KSEQ_SELF()->ksq_curr; else ke->ke_runq = &KSEQ_SELF()->ksq_idle; } runq_add(ke->ke_runq, ke); /* setrunqueue(td); */ } } else { if (ke->ke_runq) kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); /* * We will not be on the run queue. So we must be * sleeping or similar. */ if (td->td_proc->p_flag & P_SA) kse_reassign(ke); } - sched_nest = sched_lock.mtx_recurse; newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); - sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void sched_nice(struct ksegrp *kg, int nice) { struct kse *ke; struct thread *td; struct kseq *kseq; PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. */ if (kg->kg_pri_class == PRI_TIMESHARE) FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_runq == NULL) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq_nice_rem(kseq, kg->kg_nice); kseq_nice_add(kseq, nice); } kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = ticks - td->td_slptime; kg->kg_slptime += hzticks << 10; sched_interact_update(kg); sched_priority(kg); if (td->td_kse) sched_slice(td->td_kse); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ child->ke_runq = NULL; /* Grab our parents cpu estimation information. */ child->ke_ticks = ke->ke_ticks; child->ke_ltick = ke->ke_ltick; child->ke_ftick = ke->ke_ftick; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); /* XXX Need something better here */ child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; kg->kg_runtime += tickincr << 10; sched_interact_update(kg); child->kg_user_pri = kg->kg_user_pri; child->kg_nice = kg->kg_nice; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; kseq->ksq_loads[PRI_BASE(class)]++; if (kg->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, kg->kg_nice); else if (class == PRI_TIMESHARE) kseq_nice_add(kseq, kg->kg_nice); } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { /* kg->kg_slptime += child->kg_slptime; */ kg->kg_runtime += child->kg_runtime; sched_interact_update(kg); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; /* * sched_setup() apparently happens prior to stathz being set. We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. */ if (tickincr == 0) tickincr = 1; } ke = td->td_kse; kg = ke->ke_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. */ kg->kg_runtime += tickincr << 10; sched_interact_update(kg); /* * We used up one time slice. */ ke->ke_slice--; kseq = KSEQ_SELF(); #ifdef SMP kseq->ksq_rslices--; #endif if (ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; mtx_lock_spin(&sched_lock); kseq = KSEQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (kseq->ksq_load > 0) goto out; } else if (kseq->ksq_load - 1 > 0) goto out; #ifdef SMP /* * For SMP we may steal other processor's KSEs. Just search until we * verify that at least on other cpu has a runnable task. */ if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > kseq->ksq_cpus) goto out; } } #endif load = 0; out: mtx_unlock_spin(&sched_lock); return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP retry: #endif kseq = KSEQ_SELF(); ke = kseq_choose(kseq, 0); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (smp_started) { /* * Find the cpu with the highest load and steal one proc. */ if ((kseq = kseq_load_highest()) == NULL) return (NULL); /* * Remove this kse from this kseq and runq and then requeue * on the current processor. Then we will dequeue it * normally above. */ kseq_move(kseq, PCPU_GET(cpuid)); goto retry; } #endif return (NULL); } void sched_add(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; ke = td->td_kse; kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); switch (PRI_BASE(kg->kg_pri_class)) { case PRI_ITHD: case PRI_REALTIME: kseq = KSEQ_SELF(); ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: kseq = KSEQ_CPU(ke->ke_cpu); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: kseq = KSEQ_CPU(ke->ke_cpu); /* * This is for priority prop. */ if (ke->ke_thread->td_priority < PRI_MIN_IDLE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class.\n"); break; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); kseq_add(kseq, ke); } void sched_rem(struct thread *td) { struct kseq *kseq; struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); runq_remove(ke->ke_runq, ke); kseq_rem(kseq, ke); } fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct kse *ke; pctcpu = 0; ke = td->td_kse; if (ke == NULL) return (0); mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* * Don't update more frequently than twice a second. Allowing * this causes the cpu usage to decay away too quickly due to * rounding errors. */ if (ke->ke_ltick < (ticks - (hz / 2))) sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); }