sys/kern/sched_ule.c
[234 lines skipped; context: struct tdq {]
     * tdq_lock is padded to avoid false sharing with tdq_load and
     * tdq_cpu_idle.
     */
    struct mtx_padalign tdq_lock;       /* run queue lock. */
    struct cpu_group *tdq_cg;           /* Pointer to cpu topology. */
    volatile int    tdq_load;           /* Aggregate load. */
    volatile int    tdq_cpu_idle;       /* cpu_idle() is active. */
    int             tdq_sysload;        /* For loadavg, !ITHD load. */
-   int             tdq_transferable;   /* Transferable thread count. */
-   short           tdq_switchcnt;      /* Switches this tick. */
-   short           tdq_oldswitchcnt;   /* Switches last tick. */
+   volatile int    tdq_transferable;   /* Transferable thread count. */
+   volatile short  tdq_switchcnt;      /* Switches this tick. */
+   volatile short  tdq_oldswitchcnt;   /* Switches last tick. */
    u_char          tdq_lowpri;         /* Lowest priority thread. */
    u_char          tdq_ipipending;     /* IPI pending. */
    u_char          tdq_idx;            /* Current insert index. */
    u_char          tdq_ridx;           /* Current removal index. */
    struct runq     tdq_realtime;       /* real-time run queue. */
    struct runq     tdq_timeshare;      /* timeshare run queue. */
    struct runq     tdq_idle;           /* Queue of IDLE threads. */
    char            tdq_name[TDQ_NAME_LEN];
[15 lines skipped]
 /*
  * Run-time tunables.
  */
 static int rebalance = 1;
 static int balance_interval = 128;  /* Default set in sched_initticks(). */
 static int affinity;
 static int steal_idle = 1;
 static int steal_thresh = 2;
+static int always_steal = 0;
+static int trysteal_limit = 2;

 /*
  * One thread queue per processor.
  */
 static struct tdq tdq_cpu[MAXCPU];
 static struct tdq *balance_tdq;
 static int balance_ticks;
 static DPCPU_DEFINE(uint32_t, randomval);
[29 lines skipped]
 static void tdq_load_rem(struct tdq *, struct thread *);
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
 void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 static void tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
-static int tdq_move(struct tdq *, struct tdq *);
+static struct thread *tdq_move(struct tdq *, struct tdq *);
 static int tdq_idled(struct tdq *);
 static void tdq_notify(struct tdq *, struct thread *);
 static struct thread *tdq_steal(struct tdq *, int);
 static struct thread *runq_steal(struct runq *, int);
 static int sched_pickcpu(struct thread *, int);
 static void sched_balance(void);
 static int sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct thread *, int, int);
[399 lines skipped; context: if (child) { /* Handle child CPU group. */]
            if (tdq->tdq_lowpri > lgroup.cs_pri &&
                tdq->tdq_load <= lgroup.cs_limit &&
                CPU_ISSET(cpu, &lgroup.cs_mask)) {
                lgroup.cs_cpu = cpu;
                lgroup.cs_load = load - rnd;
            }
        }
        if (match & CPU_SEARCH_HIGHEST)
            if (tdq->tdq_load >= hgroup.cs_limit &&
truckman: I see a small increase in wall clock time due to this change and an increase in CPU %idle, since there are fewer opportunities to steal a thread when looking for a CPU with tdq_load > 2.

avg: As I describe in another inline comment, the current code allows stealing a single "just assigned" thread from a CPU, and that may be useful in some cases.

avg: Also, if this change is to be made, then, in my opinion, the default steal threshold should be lowered to one.

truckman: I plan to back out this change.
                tdq->tdq_transferable &&
                CPU_ISSET(cpu, &hgroup.cs_mask)) {
                hgroup.cs_cpu = cpu;
                hgroup.cs_load = load - rnd;
            }
        }
        total += load;
[89 lines skipped]
 static void
 sched_balance_group(struct cpu_group *cg)
 {
    cpuset_t hmask, lmask;
    int high, low, anylow;

    CPU_FILL(&hmask);
    for (;;) {
-       high = sched_highest(cg, hmask, 1);
+       high = sched_highest(cg, hmask, 2);
mav: It's been a while, but I am not sure this is a correct change. This code is not about idle CPUs stealing threads but about equalizing the load, so anything transferable may be worth transferring from time to time to balance cache usage and other shared resources.

truckman: The only situation where this would make a difference is if the balancer wanted to move a thread from a CPU with a load of 1 to a CPU with a load of 0. The only time that a CPU with a load of 1 would have a transferable thread is if it was previously idle and that thread was just assigned to it, but it hadn't yet done a context switch from the idle thread to the newly assigned thread. It is likely that the thread was assigned by sched_add(), which used sched_pickcpu() to choose the optimal CPU to assign the thread to.

truckman: The value 2 might be better than steal_thresh, but I don't think 1 is the correct value. I don't think it matters much, because 2 is the default value for steal_thresh, and nobody is likely to adjust it higher because that increases the wall clock time in all of the testing that I've done.
        /* Stop if there is no more CPU with transferrable threads. */
        if (high == -1)
            break;
        CPU_CLR(high, &hmask);
        CPU_COPY(&hmask, &lmask);
        /* Stop if there is no more CPU left for low. */
        if (CPU_EMPTY(&lmask))
            break;
[66 lines skipped]
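To make the threshold discussion in the inline comments concrete: sched_highest() only reports a CPU whose queue meets the load limit and has a transferable thread (the tdq_load >= cs_limit && tdq_transferable test shown earlier in this diff). The sketch below is not part of the patch; it is a standalone toy with invented names (toy_tdq, eligible_source) that illustrates why a limit of 2 excludes the load-1, just-assigned case truckman describes, while a limit of 1 does not.

#include <stdio.h>

/* Hypothetical, simplified stand-ins for the per-CPU queue fields involved. */
struct toy_tdq {
	int load;		/* like tdq_load */
	int transferable;	/* like tdq_transferable */
};

/*
 * Roughly the test cpu_search() applies for CPU_SEARCH_HIGHEST: the queue
 * must meet the load limit and have at least one transferable thread.
 */
static int
eligible_source(const struct toy_tdq *q, int limit)
{
	return (q->load >= limit && q->transferable != 0);
}

int
main(void)
{
	/* A CPU that was just handed its only thread: load 1, one transferable. */
	struct toy_tdq just_assigned = { 1, 1 };
	/* A CPU with a backlog: load 3, two transferable threads. */
	struct toy_tdq backlogged = { 3, 2 };

	printf("limit 1: just_assigned %d, backlogged %d\n",
	    eligible_source(&just_assigned, 1), eligible_source(&backlogged, 1));
	printf("limit 2: just_assigned %d, backlogged %d\n",
	    eligible_source(&just_assigned, 2), eligible_source(&backlogged, 2));
	return (0);
}

With a limit of 1 both queues qualify; with a limit of 2 only the backlogged one does, which matches the behavior change being debated above.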
 }

 /*
  * Transfer load between two imbalanced thread queues.
  */
 static int
 sched_balance_pair(struct tdq *high, struct tdq *low)
 {
-   int moved;
+   struct thread *td;
    int cpu;

    tdq_lock_pair(high, low);
-   moved = 0;
+   td = NULL;
    /*
-    * Determine what the imbalance is and then adjust that to how many
-    * threads we actually have to give up (transferable).
+    * Transfer a thread from high to low.
     */
    if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
-       (moved = tdq_move(high, low)) > 0) {
+       (td = tdq_move(high, low)) != NULL) {
        /*
-        * In case the target isn't the current cpu IPI it to force a
-        * reschedule with the new workload.
+        * In case the target isn't the current cpu notify it of the
+        * new load, possibly sending an IPI to force it to reschedule.
         */
        cpu = TDQ_ID(low);
        if (cpu != PCPU_GET(cpuid))
-           ipi_cpu(cpu, IPI_PREEMPT);
+           tdq_notify(low, td);
    }
    tdq_unlock_pair(high, low);
-   return (moved);
+   return (td != NULL);
 }
 /*
  * Move a thread from one thread queue to another.
  */
-static int
+static struct thread *
 tdq_move(struct tdq *from, struct tdq *to)
 {
    struct td_sched *ts;
    struct thread *td;
    struct tdq *tdq;
    int cpu;

    TDQ_LOCK_ASSERT(from, MA_OWNED);
    TDQ_LOCK_ASSERT(to, MA_OWNED);

    tdq = from;
    cpu = TDQ_ID(to);
    td = tdq_steal(tdq, cpu);
    if (td == NULL)
-       return (0);
+       return (NULL);
    ts = td_get_sched(td);
    /*
     * Although the run queue is locked the thread may be blocked.  Lock
     * it to clear this and acquire the run-queue lock.
     */
    thread_lock(td);
    /* Drop recursive lock on from acquired via thread_lock(). */
    TDQ_UNLOCK(from);
    sched_rem(td);
    ts->ts_cpu = cpu;
    td->td_lock = TDQ_LOCKPTR(to);
    tdq_add(to, td, SRQ_YIELDING);
-   return (1);
+   return (td);
 }
 /*
  * This tdq has idled.  Try to steal a thread from another cpu and switch
  * to it.
  */
 static int
 tdq_idled(struct tdq *tdq)
 {
    struct cpu_group *cg;
    struct tdq *steal;
    cpuset_t mask;
-   int thresh;
-   int cpu;
+   int cpu, switchcnt;

-   if (smp_started == 0 || steal_idle == 0)
+   if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
        return (1);
    CPU_FILL(&mask);
    CPU_CLR(PCPU_GET(cpuid), &mask);
-   /* We don't want to be preempted while we're iterating. */
-   spinlock_enter();
-   for (cg = tdq->tdq_cg; cg != NULL; ) {
-       if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
-           thresh = steal_thresh;
-       else
-           thresh = 1;
+restart:
+   switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+   for (cg = tdq->tdq_cg; ; ) {
+       cpu = sched_highest(cg, mask, steal_thresh);
+       /*
+        * We were assigned a thread but not preempted.  Returning
+        * 0 here will cause our caller to switch to it.
+        */
truckman: What is the intent of this code? I think I've been interpreting it backwards. On SMT hardware, do we want to make it easier or harder to steal a thread from another crippled SMT thread on the same core vs. from another core? It is a lighter-weight migration due to the shared cache, but in a 50% load situation it will tend to cluster threads on the same cores vs. spreading them out across cores with only one SMT thread on each core active, where they might run faster due to less resource contention. If this code were reversed, then the steal would happen on the first iteration if the load on the other SMT thread was very high. That same transfer could happen in the second iteration if the load on that SMT thread was the highest of all the logical CPUs at that level. I've been wondering why steals from the first level of the hierarchy were far outnumbering steals from the other two levels during my Ryzen testing ...

truckman: thresh == 1 is strange in this version of the code (which does a tdq_load >= thresh test) because most opportunities will be filtered out by the tdq_transferable check, but every once in a while tdq_load and tdq_transferable will both be 1 and the move will succeed.

avg: I think that this code makes a small difference in a single very specific scenario. If a…

truckman: Only a tiny bit sooner. After adding a bunch of debug code to analyze what exactly is happening, I discovered that the victim SMT thread is almost always blocked on thread_lock() in critical_exit() at the time the thread is actually stolen. It is awake and returning from the IPI that will trigger the preemption that will cause it to switch from the idle thread to the new thread that it was assigned, but it can't proceed because the poaching thread has called tdq_lock_pair() and grabbed its lock.
-       cpu = sched_highest(cg, mask, thresh);
+        */
+       if (tdq->tdq_load)
+           return (0);
        if (cpu == -1) {
            cg = cg->cg_parent;
+           if (cg == NULL)
+               return (1);
            continue;
        }
        steal = TDQ_CPU(cpu);
-       CPU_CLR(cpu, &mask);
+       /*
+        * The data returned by sched_highest() is stale and
+        * the chosen CPU no longer has an eligible thread.
+        *
+        * Testing this ahead of tdq_lock_pair() only catches
+        * this situation about 20% of the time on an 8 core
+        * 16 thread Ryzen 7, but it still helps performance.
+        */
+       if (steal->tdq_load < steal_thresh ||
avg: Wouldn't just returning zero here do the same job (given the loop in sched_idletd)?

truckman: Yes, I realized that and made that change in my next version of the patch.
+           steal->tdq_transferable == 0)
+           goto restart;
        tdq_lock_pair(tdq, steal);
-       if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
+       /*
+        * We were assigned a thread while waiting for the locks.
+        * Switch to it now instead of stealing a thread.
+        */
+       if (tdq->tdq_load)
+           break;
+       /*
+        * The data returned by sched_highest() is stale and
+        * the chosen CPU no longer has an eligible thread, or
+        * we were preempted and the CPU loading info may be out
+        * of date.  The latter is rare.  In either case restart
+        * the search.
+        */
+       if (steal->tdq_load < steal_thresh ||
+           steal->tdq_transferable == 0 ||
+           switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
truckman: The threshold test here should be the inverse of the condition on line 736 above. If steal->tdq_load == 1 is not matched here, then I sometimes see tdq_move() succeed when steal->tdq_load == 1 && steal->tdq_transferable != 0, which looks like we are somehow poaching the only thread assigned to that CPU. I'm not sure how that can happen, other than the other CPU being idle and somehow not having picked up the thread. The frequency of occurrence seems to be independent of the machdep.idle setting. Maybe it is busy handling an interrupt ...
            tdq_unlock_pair(tdq, steal);
-           continue;
+           goto restart;
        }
        /*
-        * If a thread was added while interrupts were disabled don't
-        * steal one here.  If we fail to acquire one due to affinity
-        * restrictions loop again with this cpu removed from the
-        * set.
+        * Steal the thread and switch to it.
         */
-       if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
+       if (tdq_move(steal, tdq) != NULL)
+           break;
+       /*
+        * We failed to acquire a thread even though it looked
+        * like one was available.  This could be due to affinity
+        * restrictions or for other reasons.  Loop again after
+        * removing this CPU from the set.  The restart logic
+        * above does not restore this CPU to the set due to the
+        * likelyhood of failing here again.
+        */
+       CPU_CLR(cpu, &mask);
        tdq_unlock_pair(tdq, steal);
-           continue;
    }
-       spinlock_exit();
    TDQ_UNLOCK(steal);
    mi_switch(SW_VOL | SWT_IDLE, NULL);
    thread_unlock(curthread);
    return (0);
 }
-   spinlock_exit();
-   return (1);
-}
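The reworked tdq_idled() above drops the old spinlock_enter() critical section in favor of a restart label: it snapshots tdq_switchcnt + tdq_oldswitchcnt before searching and, once the locks are finally held, restarts the search if that snapshot has changed or the chosen queue's data turned out to be stale. A rough userland analogue of that revalidate-after-locking pattern, with invented names (queue_lock, queue_len, change_count, steal_one) and pthreads standing in for the tdq locks, is sketched below; it is only an illustration, not kernel code.

#include <pthread.h>
#include <stdio.h>

/* Invented stand-ins for a queue protected by a lock plus a change counter. */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;		/* like tdq_load, read unlocked as a hint */
static int change_count;	/* like tdq_switchcnt + tdq_oldswitchcnt */

/* Try to take one item; returns 1 on success, 0 if the caller should give up. */
static int
steal_one(void)
{
	int snapshot;

restart:
	snapshot = change_count;	/* unlocked snapshot, may be stale */
	if (queue_len == 0)		/* cheap unlocked pre-check */
		return (0);
	pthread_mutex_lock(&queue_lock);
	/*
	 * Re-validate under the lock: if activity happened since the
	 * snapshot, or the unlocked hint turned out to be stale, start
	 * over rather than act on out-of-date information.
	 */
	if (queue_len == 0 || snapshot != change_count) {
		pthread_mutex_unlock(&queue_lock);
		goto restart;
	}
	queue_len--;
	change_count++;
	pthread_mutex_unlock(&queue_lock);
	return (1);
}

int
main(void)
{
	queue_len = 2;
	printf("stole: %d, remaining: %d\n", steal_one(), queue_len);
	return (0);
}

The kernel code reads the equivalent hints through volatile struct members rather than plain ints, but the shape of the check-snapshot-lock-revalidate loop is the same.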
 /*
  * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
  */
 static void
 tdq_notify(struct tdq *tdq, struct thread *td)
 {
    struct thread *ctd;
[782 lines skipped; context: sched_lend_user_prio(struct thread *td, u_char prio)]
    td->td_user_pri = min(prio, td->td_base_user_pri);
    if (td->td_priority > td->td_user_pri)
        sched_prio(td, td->td_user_pri);
    else if (td->td_priority != td->td_user_pri)
        td->td_flags |= TDF_NEEDRESCHED;
 }
+/*
+ * This tdq is about to idle.  Try to steal a thread from another CPU before
+ * choosing the idle thread.
+ */
+static void
+tdq_trysteal(struct tdq *tdq)
+{
+   struct cpu_group *cg;
+   struct tdq *steal;
+   cpuset_t mask;
+   int cpu, i;
+
+   if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL)
+       return;
+   CPU_FILL(&mask);
+   CPU_CLR(PCPU_GET(cpuid), &mask);
+   /* We don't want to be preempted while we're iterating. */
+   spinlock_enter();
+   TDQ_UNLOCK(tdq);
+   for (i = 1, cg = tdq->tdq_cg; ; ) {
+       cpu = sched_highest(cg, mask, steal_thresh);
+       /*
+        * If a thread was added while interrupts were disabled don't
+        * steal one here.
+        */
+       if (tdq->tdq_load > 0) {
+           TDQ_LOCK(tdq);
+           break;
+       }
+       if (cpu == -1) {
+           i++;
+           cg = cg->cg_parent;
+           if (cg == NULL || i > trysteal_limit) {
+               TDQ_LOCK(tdq);
+               break;
+           }
+           continue;
+       }
+       steal = TDQ_CPU(cpu);
+       /*
+        * The data returned by sched_highest() is stale and
+        * the chosen CPU no longer has an eligible thread.
+        */
+       if (steal->tdq_load < steal_thresh ||
+           steal->tdq_transferable == 0)
+           continue;
+       tdq_lock_pair(tdq, steal);
+       /*
+        * If we get to this point, unconditonally exit the loop
+        * to bound the time spent in the critcal section.
+        *
+        * If a thread was added while interrupts were disabled don't
+        * steal one here.
+        */
+       if (tdq->tdq_load > 0) {
+           TDQ_UNLOCK(steal);
+           break;
+       }
+       /*
+        * The data returned by sched_highest() is stale and
+        * the chosen CPU no longer has an eligible thread.
+        */
+       if (steal->tdq_load < steal_thresh ||
+           steal->tdq_transferable == 0) {
+           TDQ_UNLOCK(steal);
+           break;
+       }
+       /*
+        * If we fail to acquire one due to affinity restrictions,
+        * bail out and let the idle thread to a more complete search
+        * outside of a critical section.
+        */
+       if (tdq_move(steal, tdq) == NULL) {
+           TDQ_UNLOCK(steal);
+           break;
+       }
+       TDQ_UNLOCK(steal);
+       break;
+   }
+   spinlock_exit();
+}
+
 /*
  * Handle migration from sched_switch().  This happens only for
  * cpu binding.
  */
 static struct mtx *
 sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
 {
    struct tdq *tdn;
[95 lines skipped; context: else {]
("Thread %p shouldn't migrate", td)); | ("Thread %p shouldn't migrate", td)); | ||||
mtx = sched_switch_migrate(tdq, td, srqflag); | mtx = sched_switch_migrate(tdq, td, srqflag); | ||||
} | } | ||||
} else { | } else { | ||||
/* This thread must be going to sleep. */ | /* This thread must be going to sleep. */ | ||||
TDQ_LOCK(tdq); | TDQ_LOCK(tdq); | ||||
mtx = thread_lock_block(td); | mtx = thread_lock_block(td); | ||||
tdq_load_rem(tdq, td); | tdq_load_rem(tdq, td); | ||||
if (tdq->tdq_load == 0) | |||||
tdq_trysteal(tdq); | |||||
} | } | ||||
#if (KTR_COMPILE & KTR_SCHED) != 0 | #if (KTR_COMPILE & KTR_SCHED) != 0 | ||||
if (TD_IS_IDLETHREAD(td)) | if (TD_IS_IDLETHREAD(td)) | ||||
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", | KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", | ||||
"prio:%d", td->td_priority); | "prio:%d", td->td_priority); | ||||
else | else | ||||
KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), | KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), | ||||
▲ Show 20 Lines • Show All 715 Lines • ▼ Show 20 Lines | sched_idletd(void *dummy) | ||||
    for (;;) {
        if (tdq->tdq_load) {
            thread_lock(td);
            mi_switch(SW_VOL | SWT_IDLE, NULL);
            thread_unlock(td);
        }
        switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #ifdef SMP
-       if (switchcnt != oldswitchcnt) {
+       if (always_steal || switchcnt != oldswitchcnt) {
            oldswitchcnt = switchcnt;
            if (tdq_idled(tdq) == 0)
                continue;
        }
        switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #else
        oldswitchcnt = switchcnt;
 #endif
[20 lines skipped]
        /* Run main MD idle handler. */
        tdq->tdq_cpu_idle = 1;
        /*
         * Make sure that tdq_cpu_idle update is globally visible
         * before cpu_idle() read tdq_load.  The order is important
         * to avoid race with tdq_notify.
         */
        atomic_thread_fence_seq_cst();
+       /*
+        * Checking for again after the fence picks up assigned
+        * threads often enough to make it worthwhile to do so in
+        * order to avoid calling cpu_idle().
+        */
+       if (tdq->tdq_load != 0) {
+           tdq->tdq_cpu_idle = 0;
+           continue;
+       }
        cpu_idle(switchcnt * 4 > sched_idlespinthresh);
        tdq->tdq_cpu_idle = 0;
        /*
         * Account thread-less hardware interrupts and
         * other wakeup reasons equal to context switches.
         */
        switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
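The pre-existing fence plus the newly added re-check above form a store-then-fence-then-load handshake: the idling CPU publishes tdq_cpu_idle before taking one last look at tdq_load, while tdq_notify() publishes the new load before looking at tdq_cpu_idle, so at least one side sees the other and a wakeup cannot be lost. Below is a minimal, self-contained C11 sketch of that handshake; the flag names (cpu_idle_flag, pending_load) are invented for illustration and this is not the kernel's implementation.

#include <stdatomic.h>
#include <stdio.h>

/* Invented flags standing in for tdq_cpu_idle and tdq_load. */
static atomic_int cpu_idle_flag;
static atomic_int pending_load;

/* Idle side: advertise idleness, fence, then re-check for work. */
static int
idle_path(void)
{
	atomic_store_explicit(&cpu_idle_flag, 1, memory_order_relaxed);
	/* Publish the store before reading pending_load, like the fence above. */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&pending_load, memory_order_relaxed) != 0) {
		/* Work arrived in the window; skip the sleep entirely. */
		atomic_store_explicit(&cpu_idle_flag, 0, memory_order_relaxed);
		return (1);
	}
	/* Otherwise it is safe to sleep; a later waker will see the flag. */
	return (0);
}

/* Waker side: publish work, fence, then decide whether a wakeup is needed. */
static int
notify_path(void)
{
	atomic_store_explicit(&pending_load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	/* Only send the (expensive) wakeup if the target already went idle. */
	return (atomic_load_explicit(&cpu_idle_flag, memory_order_relaxed) != 0);
}

int
main(void)
{
	printf("needs wakeup: %d\n", notify_path());
	printf("found work before sleeping: %d\n", idle_path());
	return (0);
}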
[218 lines skipped]
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
"Enables the long-term load balancer"); | "Enables the long-term load balancer"); | ||||
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, | SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, | ||||
&balance_interval, 0, | &balance_interval, 0, | ||||
"Average period in stathz ticks to run the long-term balancer"); | "Average period in stathz ticks to run the long-term balancer"); | ||||
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, | SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, | ||||
"Attempts to steal work from other cores before idling"); | "Attempts to steal work from other cores before idling"); | ||||
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, | SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, | ||||
"Minimum load on remote CPU before we'll steal"); | "Minimum load on remote CPU before we'll steal"); | ||||
SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, | |||||
0, "Topological distance limit for stealing threads in sched_switch()"); | |||||
SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, | |||||
"Always run the stealer from the idle thread"); | |||||
SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | | SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | | ||||
CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", | ||||
"XML dump of detected CPU topology"); | "XML dump of detected CPU topology"); | ||||
#endif | #endif | ||||
/* ps compat. All cpu percentages from ULE are weighted. */ | /* ps compat. All cpu percentages from ULE are weighted. */ | ||||
static int ccpu = 0; | static int ccpu = 0; | ||||
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); | SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); |
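All of the knobs involved here, the existing kern.sched.steal_thresh that truckman's wall-clock observations in the inline comments concern as well as the kern.sched.trysteal_limit and kern.sched.always_steal tunables this patch adds, are plain integer sysctls (declared just above), so they can be read or tuned at runtime. A small, hypothetical userland sketch using sysctlbyname(3) to read them; error handling is kept minimal, and the two new names only exist on a kernel built with this change.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	/* Read the current idle-steal threshold. */
	if (sysctlbyname("kern.sched.steal_thresh", &val, &len, NULL, 0) == 0)
		printf("kern.sched.steal_thresh: %d\n", val);

	/* Only present on a kernel that includes this patch. */
	len = sizeof(val);
	if (sysctlbyname("kern.sched.always_steal", &val, &len, NULL, 0) == 0)
		printf("kern.sched.always_steal: %d\n", val);

	return (0);
}

Writing a new value works the same way by passing a non-NULL newp to sysctlbyname() (root required), or with sysctl(8) from the command line.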