Index: sched_ule.c
===================================================================
--- sched_ule.c
+++ sched_ule.c
@@ -205,6 +205,8 @@
  * realstathz:	stathz is sometimes 0 and run off of hz.
  * sched_slice:	Runtime of each thread before rescheduling.
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
+ * preempt_timeshare_delta:	Preempt if the difference in timeshare
+ *			priorities exceeds this threshold.
  */
 static int sched_interact = SCHED_INTERACT_THRESH;
 static int tickincr = 8 << SCHED_TICK_SHIFT;
@@ -214,10 +216,13 @@
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
+static int preempt_timeshare_delta = 0;
 #else
 static int preempt_thresh = PRI_MIN_KERN;
+static int preempt_timeshare_delta = (PRI_BATCH_RANGE - SCHED_PRI_NRESV) / 2;
 #endif
 #else
+static int preempt_timeshare_delta = INT_MAX;
 static int preempt_thresh = 0;
 #endif
 static int static_boost = PRI_MIN_BATCH;
@@ -316,7 +321,9 @@
 static void tdq_load_rem(struct tdq *, struct thread *);
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
-static inline int sched_shouldpreempt(int, int, int);
+static __inline void tdq_runq_elevate(struct tdq *, struct thread *);
+static inline int sched_shouldpreempt(struct tdq *, struct thread *,
+    struct thread *);
 void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 static void tdq_add(struct tdq *, struct thread *, int);
@@ -418,36 +425,85 @@
         runq_print(&tdq->tdq_idle);
 }

+/*
+ * Evaluate whether we should preempt a thread or simply set NEEDRESCHED.
+ */
 static inline int
-sched_shouldpreempt(int pri, int cpri, int remote)
+sched_shouldpreempt(struct tdq *tdq, struct thread *td, struct thread *ctd)
 {
+        int pri, cpri;
+        int remote, timeshare;
+        u_char ridx;
+
         /*
          * If the new priority is not better than the current priority there is
          * nothing to do.
          */
+        pri = td->td_priority;
+        cpri = ctd->td_priority;
         if (pri >= cpri)
                 return (0);
+
         /*
          * Always preempt idle.
          */
         if (cpri >= PRI_MIN_IDLE)
                 return (1);
+
+        /*
+         * If the threads are not both on the timeshare queue,
+         * NEEDRESCHED is set unconditionally for the lower
+         * priority curthread.  We will also preempt in most
+         * cases, which will harmlessly clear the bit.
+         *
+         * The index determines run-order more strongly than
+         * priority for timeshare threads.  We eliminate needless
+         * switches by filtering on run-queue order here.
+         */
+        timeshare = td_get_sched(td)->ts_runq == &tdq->tdq_timeshare &&
+            td_get_sched(ctd)->ts_runq == &tdq->tdq_timeshare;
+        ridx = tdq->tdq_ridx;
+        if (!timeshare ||
+            (u_char)(td->td_rqindex - ridx) <
+            (u_char)(ctd->td_rqindex - ridx)) {
+                if (ctd->td_lock == TDQ_LOCKPTR(tdq))
+                        ctd->td_flags |= TDF_NEEDRESCHED;
+        }
+
         /*
          * If preemption is disabled don't preempt others.
          */
         if (preempt_thresh == 0)
                 return (0);
+
         /*
          * Preempt if we exceed the threshold.
          */
         if (pri <= preempt_thresh)
                 return (1);
+
         /*
          * If we're interactive or better and there is non-interactive
-         * or worse running preempt only remote processors.
+         * or worse running, preempt remote processors.  Local processors
+         * will honor this from NEEDRESCHED and avoid preemption.  A future
+         * enhancement could do the same for remote.
          */
+        remote = tdq != TDQ_SELF();
         if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
                 return (1);
+
+        /*
+         * If the difference between the two timeshare threads' priorities
+         * exceeds the delta threshold we elevate the new thread on the
+         * timeshare queue and request a resched.
+         */
+        if (timeshare && cpri - pri > preempt_timeshare_delta) {
+                tdq_runq_elevate(tdq, td);
+                if (ctd->td_lock == TDQ_LOCKPTR(tdq))
+                        ctd->td_flags |= TDF_NEEDRESCHED;
+                return (remote);
+        }
+
         return (0);
 }

@@ -502,6 +558,21 @@
         runq_add(ts->ts_runq, td, flags);
 }

+static void
+tdq_runq_elevate(struct tdq *tdq, struct thread *td)
+{
+        struct td_sched *ts;
+
+        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+        THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+        ts = td_get_sched(td);
+        if (ts->ts_runq == &tdq->tdq_timeshare) {
+                runq_remove_idx(ts->ts_runq, td, NULL);
+                runq_add_pri(ts->ts_runq, td, tdq->tdq_ridx, SRQ_PREEMPTED);
+        }
+}
+
 /*
  * Remove a thread from a run-queue.  This typically happens when a thread
  * is selected to run.  Running threads are not on the queue and the
@@ -1075,15 +1146,14 @@
 tdq_notify(struct tdq *tdq, struct thread *td)
 {
         struct thread *ctd;
-        int pri;
         int cpu;

         if (tdq->tdq_ipipending)
                 return;
         cpu = td_get_sched(td)->ts_cpu;
-        pri = td->td_priority;
         ctd = pcpu_find(cpu)->pc_curthread;
-        if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
+
+        if (!sched_shouldpreempt(tdq, td, ctd))
                 return;
         /*
@@ -1567,8 +1637,8 @@
         score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
         if (score < sched_interact) {
                 pri = PRI_MIN_INTERACT;
-                pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
-                    sched_interact) * score;
+                pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT) * score) /
+                    sched_interact;
                 KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
                     ("sched_priority: invalid interactive priority %d score %d",
                     pri, score));
@@ -2173,7 +2243,7 @@
                 return;
         if (static_boost == 1 && prio)
                 sched_prio(td, prio);
-        else if (static_boost && td->td_priority > static_boost)
+        else if (static_boost > 1 && td->td_priority > static_boost)
                 sched_prio(td, static_boost);
 }

@@ -2330,6 +2400,7 @@
 sched_preempt(struct thread *td)
 {
         struct tdq *tdq;
+        struct thread *ntd;

         SDT_PROBE2(sched, , , surrender, td, td->td_proc);

@@ -2337,7 +2408,14 @@
         tdq = TDQ_SELF();
         TDQ_LOCK_ASSERT(tdq, MA_OWNED);
         tdq->tdq_ipipending = 0;
-        if (td->td_priority > tdq->tdq_lowpri) {
+
+        /*
+         * The state could have changed since the remote processor signaled,
+         * or it may have simply signaled to trigger NEEDRESCHED.  We
+         * filter again here before preempting.
+         */
+        ntd = tdq_choose(tdq);
+        if (ntd != NULL && sched_shouldpreempt(tdq, ntd, td)) {
                 int flags;

                 flags = SW_INVOL | SW_PREEMPT;
@@ -2489,19 +2567,13 @@
 sched_setpreempt(struct thread *td)
 {
         struct thread *ctd;
-        int cpri;
-        int pri;

         THREAD_LOCK_ASSERT(curthread, MA_OWNED);
         ctd = curthread;
-        pri = td->td_priority;
-        cpri = ctd->td_priority;
-        if (pri < cpri)
-                ctd->td_flags |= TDF_NEEDRESCHED;
-        if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
+        if (panicstr != NULL || cold || TD_IS_INHIBITED(ctd))
                 return;
-        if (!sched_shouldpreempt(pri, cpri, 0))
+        if (!sched_shouldpreempt(TDQ_SELF(), td, ctd))
                 return;
         ctd->td_owepreempt = 1;
 }

@@ -3044,8 +3116,12 @@
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
     &preempt_thresh, 0,
     "Maximal (lowest) priority for preemption");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_timeshare_delta, CTLFLAG_RW,
+    &preempt_timeshare_delta, 0,
+    "Difference in timeshare priorities required for preemption");
 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
-    "Assign static kernel priorities to sleeping threads");
+    "Elevate priorities of sleeping threads.  "
+    "0 = disabled, 1 = kernel supplied value, >1 = specified priority.");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
     "Number of times idle thread will spin waiting for new work");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
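
A stand-alone sketch of the run-order filter in sched_shouldpreempt(), for readers unfamiliar with the calendar queue: tdq_ridx marks the bucket of tdq_timeshare currently being drained and td_rqindex marks the bucket a thread will run from, so two threads can be compared by how far each sits from the rotating head.  The program below is illustrative only (the queue size and helper name are made up); the hunk above uses unsigned u_char subtraction against tdq_ridx to the same end.

/*
 * Illustration of ordering calendar-queue entries by distance from the
 * rotating head.  NQUEUES and runs_in() are invented for this example;
 * in the scheduler the queue is tdq_timeshare, the head is tdq_ridx and
 * a thread's bucket is td_rqindex.
 */
#include <stdio.h>

#define NQUEUES 64              /* buckets in the circular queue */

/* Buckets that drain before 'idx' is reached, starting from 'ridx'. */
static unsigned int
runs_in(unsigned int idx, unsigned int ridx)
{

        return ((idx - ridx) % NQUEUES);
}

int
main(void)
{
        unsigned int ridx = 60;         /* head of the rotating queue */
        unsigned int newidx = 2;        /* bucket of the thread being added */
        unsigned int curidx = 61;       /* bucket of the running thread */

        /*
         * Numerically 2 < 61, but the queue has wrapped: the running thread
         * is one bucket from the head while the new thread is six buckets
         * out, so the new thread runs later and preempting curthread for it
         * would be pointless.
         */
        printf("new thread runs in %u buckets, curthread in %u\n",
            runs_in(newidx, ridx), runs_in(curidx, ridx));
        return (0);
}

This is also why tdq_runq_elevate() reinserts the thread at tdq_ridx: moving it into the head bucket is what makes the accompanying NEEDRESCHED actually select it next.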
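
The sched_priority() hunk reorders the interactive scaling so the multiplication happens before the division.  A toy comparison (not part of the patch; RANGE and THRESH stand in for PRI_MAX_INTERACT - PRI_MIN_INTERACT and sched_interact, and the values assume a 32-slot interactive span with the threshold raised to 50) shows the truncation the old form is prone to:

/*
 * Old vs. new interactive priority scaling from sched_priority().
 * RANGE and THRESH are assumed stand-ins; the kernel derives the real
 * values from priority.h and the sched_interact tunable.
 */
#include <stdio.h>

#define RANGE   31      /* assumed PRI_MAX_INTERACT - PRI_MIN_INTERACT */
#define THRESH  50      /* assumed sched_interact after tuning */

int
main(void)
{
        int score;

        for (score = 0; score < THRESH; score += 10) {
                int oldoff = ((RANGE + 1) / THRESH) * score;    /* divide first */
                int newoff = (RANGE * score) / THRESH;          /* multiply first */

                printf("score %2d: old offset %2d, new offset %2d\n",
                    score, oldoff, newoff);
        }
        return (0);
}

With the divisor larger than the range, (RANGE + 1) / THRESH truncates to zero and the old expression maps every interactive score to PRI_MIN_INTERACT, while the new one still spreads scores across the range.  At the stock threshold of 30 the two forms produce identical offsets, so the change only matters when sched_interact is tuned away from its default.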