Differential D26011 Diff 79950 sys/kern/subr_sleepqueue.c

Changeset View

Standalone View

sys/kern/subr_sleepqueue.c

Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines
#endif		#endif
static int sleepq_init(void *mem, int size, int flags);		static int sleepq_init(void *mem, int size, int flags);
static int sleepq_resume_thread(struct sleepqueue sq, struct thread td,		static int sleepq_resume_thread(struct sleepqueue sq, struct thread td,
int pri, int srqflags);		int pri, int srqflags);
static void sleepq_remove_thread(struct sleepqueue sq, struct thread td);		static void sleepq_remove_thread(struct sleepqueue sq, struct thread td);
static void sleepq_switch(const void *wchan, int pri);		static void sleepq_switch(const void *wchan, int pri);
static void sleepq_timeout(void *arg);		static void sleepq_timeout(void *arg);

		SDT_PROVIDER_DECLARE(sched);
SDT_PROBE_DECLARE(sched, , , sleep);		SDT_PROBE_DECLARE(sched, , , sleep);
SDT_PROBE_DECLARE(sched, , , wakeup);		SDT_PROBE_DECLARE(sched, , , wakeup);

		SDT_PROBE_DEFINE2(sched, , sleepq, usleep, "struct thread *",
		"const char *");
		markjUnsubmitted Not Done Inline Actions Why not put these under the sched provider? I think the proposed names are confusing. "kernel" is too generic, and we already have a "profile" provider. markj: Why not put these under the sched provider? I think the proposed names are confusing. "kernel"…
		SDT_PROBE_DEFINE2(sched, , sleepq, uwakeup, "struct thread *",
		"const char *");
		SDT_PROBE_DEFINE2(sched, , sleepq, ksleep, "struct thread *",
		"const char *");
		SDT_PROBE_DEFINE2(sched, , sleepq, kwakeup, "struct thread *",
		"const char *");

/*		/*
* Initialize SLEEPQUEUE_PROFILING specific sysctl nodes.		* Initialize SLEEPQUEUE_PROFILING specific sysctl nodes.
* Note that it must happen after sleepinit() has been fully executed, so		* Note that it must happen after sleepinit() has been fully executed, so
* it must happen after SI_SUB_KMEM SYSINIT() subsystem setup.		* it must happen after SI_SUB_KMEM SYSINIT() subsystem setup.
*/		*/
#ifdef SLEEPQUEUE_PROFILING		#ifdef SLEEPQUEUE_PROFILING
static void		static void
init_sleepqueue_profiling(void)		init_sleepqueue_profiling(void)
▲ Show 20 Lines • Show All 188 Lines • ▼ Show 20 Lines	#endif
td->td_sqqueue = queue;		td->td_sqqueue = queue;
td->td_wchan = wchan;		td->td_wchan = wchan;
td->td_wmesg = wmesg;		td->td_wmesg = wmesg;
if (flags & SLEEPQ_INTERRUPTIBLE) {		if (flags & SLEEPQ_INTERRUPTIBLE) {
td->td_intrval = 0;		td->td_intrval = 0;
td->td_flags \|= TDF_SINTR;		td->td_flags \|= TDF_SINTR;
}		}
td->td_flags &= ~TDF_TIMEOUT;		td->td_flags &= ~TDF_TIMEOUT;
		if (flags & SLEEPQ_USERWAIT) {
		td->td_flags \|= TDF_USERWAIT;
		}
thread_unlock(td);		thread_unlock(td);
}		}

/*		/*
* Sets a timeout that will remove the current thread from the specified		* Sets a timeout that will remove the current thread from the specified
* sleep queue after timo ticks if the thread has not already been awakened.		* sleep queue after timo ticks if the thread has not already been awakened.
*/		*/
void		void
▲ Show 20 Lines • Show All 205 Lines • ▼ Show 20 Lines	sleepq_switch(const void *wchan, int pri)
}		}
#ifdef SLEEPQUEUE_PROFILING		#ifdef SLEEPQUEUE_PROFILING
if (prof_enabled)		if (prof_enabled)
sleepq_profile(td->td_wmesg);		sleepq_profile(td->td_wmesg);
#endif		#endif
MPASS(td->td_sleepqueue == NULL);		MPASS(td->td_sleepqueue == NULL);
sched_sleep(td, pri);		sched_sleep(td, pri);
thread_lock_set(td, &sc->sc_lock);		thread_lock_set(td, &sc->sc_lock);
SDT_PROBE0(sched, , , sleep);
TD_SET_SLEEPING(td);		TD_SET_SLEEPING(td);
		if (!SDT_PROBES_ENABLED()) {
mi_switch(SW_VOL \| SWT_SLEEPQ);		mi_switch(SW_VOL \| SWT_SLEEPQ);
		} else {
		SDT_PROBE0(sched, , , sleep);
		if ((td->td_flags & TDF_USERWAIT) == 0)
		SDT_PROBE2(sched, , sleepq, ksleep, td,
		td->td_wmesg);
		markjUnsubmitted Not Done Inline Actions Why do we need all of these new probes? You can write sched:::sleep /args[0]->td_flags & 0x80/ to catch user sleeps. Add a constant to /usr/lib/dtrace/sched.d to avoid hard-coding 0x80. Or, add a new parameter to sched:::sleep to indicate what kind fo sleep it is. markj: Why do we need all of these new probes? You can write ``` sched:::sleep /args[0]->td_flags &…
		mjgAuthorUnsubmitted Done Inline Actions I noted in the review description that the intent is to reduce probe effect -- going off/on cpu happens all the time and only fraction of it is of interest. The above filters it out without descending into dtrace. Apart from that, since the points are ts/sleepq aware, more information can be added to them later. mjg: I noted in the review description that the intent is to reduce probe effect -- going off/on cpu…
		markjUnsubmitted Not Done Inline Actions I think you need to demonstrate that the probe effect is severe enough to skew the results. Running your threadspawn benchmark shows hundreds of thousands of context switches per second; with dtrace probes enabled as I suggested I see less than 1% reduction in throughput. sched:::sleep and sched:::on-cpu fire in slow paths by definition. I don't see why a sufficiently large dynvar space wouldn't be enough; if you're getting drops with large dynvarsize then I suspect either a bug in your script, in DTrace or in the placement of sched probes. I also tried a script which checks curthread->td_wmesg against a table of 15 common user sleep points, again with very low overhead. markj: I think you need to demonstrate that the probe effect is severe enough to skew the results.
		mjgAuthorUnsubmitted Done Inline Actions Note that in stock kernel the benchmark still suffers significant problems both single and mult-threaded, largely because of IPIs flying due to kstack page invalidation. I have an experimental patchset which deals with that problem and more, with that in place single-threaded test looks like this: [stock] min:165130 max:165130 total:165130 min:165287 max:165287 total:165287 min:164162 max:164162 total:164162 [dtrace -n 'sched:::on-cpu /(curthread->td_flags & 0x80) == 0/ { @[stack()] = count(); }'] min:124408 max:124408 total:124408 min:124280 max:124280 total:124280 min:123964 max:123964 total:123964 multithreaded this still suffers a lot of contention even with the patchset (to be fixed soon(tm)). However, I indeed can't reproduce dynamic var drops with the current script. It is plausible the old one had a bug there. Still, I think the above justifies not going into dtrace if possible. Regradless, it may be adding the td + lockname/mesg touple to sched:::sleep should be done regardless of the patch. mjg: Note that in stock kernel the benchmark still suffers significant problems both single and mult…
		markjUnsubmitted Not Done Inline Actions That's expensive because it is tracing the stack most of the time. The predicate is wrong because TDF_USERWAIT is cleared before sched:::on-cpu fires. This represents a bug in the patch. If I apply UMA patches to fix the kstack cache, for the single-threaded case I get: [stock] min: 120027 max: 120027 total: 120027 min: 120036 max: 120036 total: 120036 min: 120081 max: 120081 total: 120081 [dtrace -n 'sched:::on-cpu /(curthread->td_flags & 0x80) == 0/ { @[stack()] = count(); }'] min: 85787 max: 85787 total: 85787 min: 85983 max: 85983 total: 85983 min: 85812 max: 85812 total: 85812 [dtrace -n 'sched::sleepq:kwakeup {@[stack()] = count();}'] min: 104631 max: 104631 total: 104631 min: 104761 max: 104761 total: 104761 min: 104667 max: 104667 total: 104667 [dtrace -n 'sched:::sleep {self->t = 1;} sched:::on-cpu /self->t && (curthread->td_flags & 0x80) == 0/{ @[stack()] = count();}'] min: 97170 max: 97170 total: 97170 min: 97438 max: 97438 total: 97438 min: 97440 max: 97440 total: 97440 [dtrace -n 'sched:::sleep {self->t = 1;} sched:::on-cpu /self->t && (curthread->td_flags & 0x80) != 0/{ @[stack()] = count();}'] min: 102709 max: 102709 total: 102709 min: 102366 max: 102366 total: 102366 min: 102635 max: 102635 total: 102635 [https://people.freebsd.org/~markj/dtrace/off-cpu] min: 103237 max: 103237 total: 103237 min: 103268 max: 103268 total: 103268 min: 103509 max: 103509 total: 103509 and this benchmark is an extreme case because every iteration causes the looping thread to context switch. I see a smaller probe effect for other will-it-scale benchmarks. Moreover I see a significant effect simply from enabling any SDT probes, presumably because it forces us to take slow paths in synchronization primitives. markj: That's expensive because it is tracing the stack most of the time. The predicate is wrong…
		else
		SDT_PROBE2(sched, , sleepq, usleep, td,
		td->td_wmesg);
		mi_switch(SW_VOL \| SWT_SLEEPQ);
		if ((td->td_flags & TDF_USERWAIT) == 0)
		SDT_PROBE2(sched, , sleepq, kwakeup, td,
		td->td_wmesg);
		else
		SDT_PROBE2(sched, , sleepq, uwakeup, td,
		td->td_wmesg);
		markjUnsubmitted Not Done Inline Actions Similarly you can use sched:::on-cpu with a td_flags predicate instead of adding new probes here. markj: Similarly you can use sched:::on-cpu with a td_flags predicate instead of adding new probes…
		}
KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));		KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",		CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
(void )td, (long)td->td_proc->p_pid, (void )td->td_name);		(void )td, (long)td->td_proc->p_pid, (void )td->td_name);
}		}

/*		/*
* Check to see if we timed out.		* Check to see if we timed out.
*/		*/
▲ Show 20 Lines • Show All 226 Lines • ▼ Show 20 Lines	if ((td->td_flags & TDF_TIMEOUT) == 0 && td->td_sleeptimo != 0)
* sleepq_timeout() ensure that the thread does not		* sleepq_timeout() ensure that the thread does not
* get spurious wakeups, even if the callout was reset		* get spurious wakeups, even if the callout was reset
* or thread reused.		* or thread reused.
*/		*/
callout_stop(&td->td_slpcallout);		callout_stop(&td->td_slpcallout);

td->td_wmesg = NULL;		td->td_wmesg = NULL;
td->td_wchan = NULL;		td->td_wchan = NULL;
td->td_flags &= ~(TDF_SINTR \| TDF_TIMEOUT);		td->td_flags &= ~(TDF_SINTR \| TDF_TIMEOUT \| TDF_USERWAIT);

CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",		CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, td->td_name);		(void *)td, (long)td->td_proc->p_pid, td->td_name);
}		}

#ifdef INVARIANTS		#ifdef INVARIANTS
/*		/*
* UMA zone item deallocator.		* UMA zone item deallocator.
▲ Show 20 Lines • Show All 635 Lines • Show Last 20 Lines