diff --git a/sys/amd64/include/runq.h b/sys/amd64/include/runq.h deleted file mode 100644 --- a/sys/amd64/include/runq.h +++ /dev/null @@ -1,46 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2001 Jake Burkholder - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _MACHINE_RUNQ_H_ -#define _MACHINE_RUNQ_H_ - -#define RQB_LEN (1) /* Number of priority status words. */ -#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY)). */ -#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */ - -#define RQB_BIT(pri) ((rqb_word_t)1 << ((pri) & (RQB_BPW - 1))) -#define RQB_WORD(pri) ((pri) >> RQB_L2BPW) - -#define RQB_FFS(word) (bsfq(word)) - -/* - * Type of run queue status word. - */ -typedef u_int64_t rqb_word_t; - -#endif diff --git a/sys/arm/include/runq.h b/sys/arm/include/runq.h deleted file mode 100644 --- a/sys/arm/include/runq.h +++ /dev/null @@ -1,46 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2001 Jake Burkholder - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE.
- */ - -#ifndef _MACHINE_RUNQ_H_ -#define _MACHINE_RUNQ_H_ - -#define RQB_LEN (2) /* Number of priority status words. */ -#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */ -#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */ - -#define RQB_BIT(pri) ((rqb_word_t)1 << ((pri) & (RQB_BPW - 1))) -#define RQB_WORD(pri) ((pri) >> RQB_L2BPW) - -#define RQB_FFS(word) (ffs(word) - 1) - -/* - * Type of run queue status word. - */ -typedef u_int32_t rqb_word_t; - -#endif diff --git a/sys/arm64/include/runq.h b/sys/arm64/include/runq.h deleted file mode 100644 --- a/sys/arm64/include/runq.h +++ /dev/null @@ -1,50 +0,0 @@ -/*- - * Copyright (c) 2001 Jake Burkholder - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifdef __arm__ -#include <arm/runq.h> -#else /* !__arm__ */ - -#ifndef _MACHINE_RUNQ_H_ -#define _MACHINE_RUNQ_H_ - -#define RQB_LEN (1) /* Number of priority status words. */ -#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY)). */ -#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */ - -#define RQB_BIT(pri) ((rqb_word_t)1 << ((pri) & (RQB_BPW - 1))) -#define RQB_WORD(pri) ((pri) >> RQB_L2BPW) - -#define RQB_FFS(word) (ffsl(word) - 1) - -/* - * Type of run queue status word. - */ -typedef unsigned long rqb_word_t; - -#endif - -#endif /* !__arm__ */ diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -14307,7 +14307,7 @@ CTL_DEBUG_PRINT(("ctl_work_thread starting\n")); thread_lock(curthread); - sched_prio(curthread, PUSER - 1); + sched_prio(curthread, PRI_MAX_KERN); thread_unlock(curthread); while (!softc->shutdown) { @@ -14399,7 +14399,7 @@ CTL_DEBUG_PRINT(("ctl_thresh_thread starting\n")); thread_lock(curthread); - sched_prio(curthread, PUSER - 1); + sched_prio(curthread, PRI_MAX_KERN); thread_unlock(curthread); while (!softc->shutdown) { diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h @@ -44,7 +44,9 @@ #ifdef _KERNEL #define CPU curcpu #define minclsyspri PRIBIO -#define defclsyspri minclsyspri +#define defclsyspri minclsyspri +/* Write issue taskq priority.
*/ +#define wtqclsyspri ((PVM + PRIBIO) / 2) #define maxclsyspri PVM #define max_ncpus (mp_maxid + 1) #define boot_max_ncpus (mp_maxid + 1) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h --- a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h @@ -91,8 +91,10 @@ * Treat shim tasks as SCHED_NORMAL tasks */ #define minclsyspri (MAX_PRIO-1) -#define maxclsyspri (MAX_RT_PRIO) #define defclsyspri (DEFAULT_PRIO) +/* Write issue taskq priority. */ +#define wtqclsyspri (MAX_RT_PRIO + 1) +#define maxclsyspri (MAX_RT_PRIO) #ifndef NICE_TO_PRIO #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -622,8 +622,10 @@ * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 -#define maxclsyspri -20 #define defclsyspri 0 +/* Write issue taskq priority. */ +#define wtqclsyspri -19 +#define maxclsyspri -20 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -1158,29 +1158,14 @@ spa->spa_proc, zio_taskq_basedc, flags); } else { #endif - pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. - * - * Under Linux and FreeBSD this means incrementing - * the priority value as opposed to platforms like - * illumos where it should be decremented. - * - * On FreeBSD, if priorities divided by four (RQ_PPQ) - * are equal then a difference between them is - * insignificant. */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { -#if defined(__linux__) - pri++; -#elif defined(__FreeBSD__) - pri += 4; -#else -#error "unknown OS" -#endif - } + const pri_t pri = (t == ZIO_TYPE_WRITE && + q == ZIO_TASKQ_ISSUE) ? 
+ wtqclsyspri : maxclsyspri; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef HAVE_SYSDC diff --git a/sys/dev/beri/beri_ring.c b/sys/dev/beri/beri_ring.c --- a/sys/dev/beri/beri_ring.c +++ b/sys/dev/beri/beri_ring.c @@ -170,7 +170,7 @@ } mtx_lock(&sc->beri_mtx); - selwakeuppri(&sc->beri_rsel, PZERO + 1); + selwakeuppri(&sc->beri_rsel, PZERO); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } @@ -190,7 +190,7 @@ } mtx_lock(&sc->beri_mtx); - selwakeuppri(&sc->beri_rsel, PZERO + 1); + selwakeuppri(&sc->beri_rsel, PZERO); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } diff --git a/sys/dev/firewire/firewirereg.h b/sys/dev/firewire/firewirereg.h --- a/sys/dev/firewire/firewirereg.h +++ b/sys/dev/firewire/firewirereg.h @@ -293,7 +293,7 @@ extern devclass_t firewire_devclass; extern int firewire_phydma_enable; -#define FWPRI ((PZERO + 8) | PCATCH) +#define FWPRI (PWAIT | PCATCH) #define CALLOUT_INIT(x) callout_init(x, 1 /* mpsafe */) diff --git a/sys/dev/syscons/syscons.c b/sys/dev/syscons/syscons.c --- a/sys/dev/syscons/syscons.c +++ b/sys/dev/syscons/syscons.c @@ -1310,7 +1310,7 @@ if (i == sc->cur_scp->index) return 0; error = - tsleep(VTY_WCHAN(sc, i), (PZERO + 1) | PCATCH, "waitvt", 0); + tsleep(VTY_WCHAN(sc, i), PZERO | PCATCH, "waitvt", 0); return error; case VT_GETACTIVE: /* get active vty # */ diff --git a/sys/dev/usb/usb_process.h b/sys/dev/usb/usb_process.h --- a/sys/dev/usb/usb_process.h +++ b/sys/dev/usb/usb_process.h @@ -31,7 +31,6 @@ #ifndef USB_GLOBAL_INCLUDE_FILE #include #include -#include #endif /* defines */ diff --git a/sys/dev/vkbd/vkbd.c b/sys/dev/vkbd/vkbd.c --- a/sys/dev/vkbd/vkbd.c +++ b/sys/dev/vkbd/vkbd.c @@ -82,7 +82,7 @@ #define VKBD_UNLOCK(s) mtx_unlock(&(s)->ks_lock) #define VKBD_LOCK_ASSERT(s, w) mtx_assert(&(s)->ks_lock, w) #define VKBD_SLEEP(s, f, d, t) \ - msleep(&(s)->f, &(s)->ks_lock, PCATCH | (PZERO + 1), d, t) + msleep(&(s)->f, &(s)->ks_lock, PCATCH | PZERO, d, t) #else #define VKBD_LOCK_DECL #define VKBD_LOCK_INIT(s) @@ -90,7 +90,7 @@ #define VKBD_LOCK(s) #define VKBD_UNLOCK(s) #define VKBD_LOCK_ASSERT(s, w) -#define VKBD_SLEEP(s, f, d, t) tsleep(&(s)->f, PCATCH | (PZERO + 1), d, t) +#define VKBD_SLEEP(s, f, d, t) tsleep(&(s)->f, PCATCH | PZERO, d, t) #endif #define VKBD_KEYBOARD(d) \ @@ -268,8 +268,8 @@ VKBD_SLEEP(state, ks_task, "vkbdc", 0); /* wakeup poll()ers */ - selwakeuppri(&state->ks_rsel, PZERO + 1); - selwakeuppri(&state->ks_wsel, PZERO + 1); + selwakeuppri(&state->ks_rsel, PZERO); + selwakeuppri(&state->ks_wsel, PZERO); state->ks_flags &= ~OPEN; state->ks_dev = NULL; @@ -498,7 +498,7 @@ if (!(state->ks_flags & STATUS)) { state->ks_flags |= STATUS; - selwakeuppri(&state->ks_rsel, PZERO + 1); + selwakeuppri(&state->ks_rsel, PZERO); wakeup(&state->ks_flags); } } @@ -531,7 +531,7 @@ q->head = 0; /* wakeup ks_inq writers/poll()ers */ - selwakeuppri(&state->ks_wsel, PZERO + 1); + selwakeuppri(&state->ks_wsel, PZERO); wakeup(q); return (c); @@ -1246,7 +1246,7 @@ /* flush ks_inq and wakeup writers/poll()ers */ state->ks_inq.head = state->ks_inq.tail = state->ks_inq.cc = 0; - selwakeuppri(&state->ks_wsel, PZERO + 1); + selwakeuppri(&state->ks_wsel, PZERO); wakeup(&state->ks_inq); } diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -152,7 +152,7 @@ FUSE_LOCK(); fuse_lck_mtx_lock(fdata->aw_mtx); /* wakup poll()ers */ - selwakeuppri(&fdata->ks_rsel, PZERO + 1); + selwakeuppri(&fdata->ks_rsel, 
PZERO); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(fdata))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -932,7 +932,7 @@ if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; - tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); + tsleep(&fvdat->flag, PRIBIO, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c --- a/sys/fs/fuse/fuse_ipc.c +++ b/sys/fs/fuse/fuse_ipc.c @@ -593,7 +593,7 @@ fuse_lck_mtx_lock(data->ms_mtx); data->dataflags |= FSESS_DEAD; wakeup_one(data); - selwakeuppri(&data->ks_rsel, PZERO + 1); + selwakeuppri(&data->ks_rsel, PZERO); wakeup(&data->ticketer); fuse_lck_mtx_unlock(data->ms_mtx); FUSE_UNLOCK(); @@ -669,7 +669,7 @@ else fuse_ms_push(ftick); wakeup_one(ftick->tk_data); - selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); + selwakeuppri(&ftick->tk_data->ks_rsel, PZERO); KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -4644,7 +4644,7 @@ ts.tv_sec = 0; ts.tv_nsec = 0; (void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR, - PZERO - 1, "nfsndlck", &ts); + PVFS, "nfsndlck", &ts); } *flagp |= NFSR_SNDLOCK; NFSUNLOCKSOCK(); diff --git a/sys/fs/nfsserver/nfs_nfsdcache.c b/sys/fs/nfsserver/nfs_nfsdcache.c --- a/sys/fs/nfsserver/nfs_nfsdcache.c +++ b/sys/fs/nfsserver/nfs_nfsdcache.c @@ -392,7 +392,7 @@ nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, + (void)mtx_sleep(rp, mutex, PVFS | PDROP, "nfsrc", 10 * hz); goto loop; } @@ -678,7 +678,7 @@ rp = hitrp; if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, + (void)mtx_sleep(rp, mutex, PVFS | PDROP, "nfsrc", 10 * hz); goto tryagain; } @@ -750,7 +750,7 @@ mtx_assert(mutex, MA_OWNED); while ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0); + (void)mtx_sleep(rp, mutex, PVFS, "nfsrc", 0); } rp->rc_flag |= RC_LOCKED; } diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -507,7 +507,7 @@ NFSLOCKSTATE(); while (clp->lc_cbref) { clp->lc_flags |= LCL_WAKEUPWANTED; - (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, + (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS, "nfsd clp", 10 * hz); } NFSUNLOCKSTATE(); @@ -574,7 +574,7 @@ NFSLOCKSTATE(); while (clp->lc_cbref) { clp->lc_flags |= LCL_WAKEUPWANTED; - (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, + (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS, "nfsdclp", 10 * hz); } NFSUNLOCKSTATE(); diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c --- a/sys/fs/smbfs/smbfs_io.c +++ b/sys/fs/smbfs/smbfs_io.c @@ -629,7 +629,7 @@ while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; - error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz); + error = tsleep(&np->n_flag, PRIBIO, "smfsvinv", 2 * hz); error = smb_td_intr(td); if (error == EINTR) return EINTR; diff --git a/sys/i386/include/runq.h b/sys/i386/include/runq.h deleted file mode 100644 --- a/sys/i386/include/runq.h +++ /dev/null @@ -1,46 
+0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2001 Jake Burkholder - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _MACHINE_RUNQ_H_ -#define _MACHINE_RUNQ_H_ - -#define RQB_LEN (2) /* Number of priority status words. */ -#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */ -#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */ - -#define RQB_BIT(pri) ((rqb_word_t)1 << ((pri) & (RQB_BPW - 1))) -#define RQB_WORD(pri) ((pri) >> RQB_L2BPW) - -#define RQB_FFS(word) (ffs(word) - 1) - -/* - * Type of run queue status word. - */ -typedef u_int32_t rqb_word_t; - -#endif diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c --- a/sys/kern/kern_rmlock.c +++ b/sys/kern/kern_rmlock.c @@ -1010,7 +1010,8 @@ mtx_lock(&rms->mtx); while (rms->writers > 0) - msleep(&rms->readers, &rms->mtx, PUSER - 1, mtx_name(&rms->mtx), 0); + msleep(&rms->readers, &rms->mtx, PRI_MAX_KERN, + mtx_name(&rms->mtx), 0); critical_enter(); rms_int_readers_inc(rms, rms_int_pcpu(rms)); mtx_unlock(&rms->mtx); @@ -1197,7 +1198,7 @@ mtx_lock(&rms->mtx); rms->writers++; if (rms->writers > 1) { - msleep(&rms->owner, &rms->mtx, (PUSER - 1), + msleep(&rms->owner, &rms->mtx, PRI_MAX_KERN, mtx_name(&rms->mtx), 0); MPASS(rms->readers == 0); KASSERT(rms->owner == RMS_TRANSIENT, @@ -1213,7 +1214,7 @@ rms_assert_no_pcpu_readers(rms); if (rms->readers > 0) { - msleep(&rms->writers, &rms->mtx, (PUSER - 1), + msleep(&rms->writers, &rms->mtx, PRI_MAX_KERN, mtx_name(&rms->mtx), 0); } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -38,6 +38,7 @@ #include #include #include +#include <sys/runq.h> #include #include #include @@ -57,8 +58,6 @@ #endif #endif -CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); - /* * kern.sched.preemption allows user space to determine if preemption support * is compiled in or not.
It is not currently a boot or runtime flag that @@ -253,6 +252,35 @@ /************************************************************************ * SYSTEM RUN QUEUE manipulations and tests * ************************************************************************/ +_Static_assert(RQSW_BPW == (1 << RQSW_L2BPW), + "RQSW_L2BPW and RQSW_BPW / 'rqsw_t' mismatch"); +_Static_assert(RQ_NQS <= 256, + "'td_rqindex' must be turned into a bigger unsigned type"); +/* A macro instead of a function to get the proper calling function's name. */ +#define CHECK_IDX(idx) ({ \ + __typeof(idx) _idx __unused = (idx); \ + KASSERT(0 <= _idx && _idx < RQ_NQS, \ + ("%s: %s out of range: %d", __func__, __STRING(idx), _idx)); \ +}) + +/* Status words' individual bit manipulators' internals. */ +typedef uintptr_t runq_sw_op(int idx, int sw_idx, rqsw_t sw_bit, + rqsw_t *swp); +static inline uintptr_t runq_sw_apply(struct runq *rq, int idx, + runq_sw_op *op); + +static inline uintptr_t runq_sw_set_not_empty_op(int idx, int sw_idx, + rqsw_t sw_bit, rqsw_t *swp); +static inline uintptr_t runq_sw_set_empty_op(int idx, int sw_idx, + rqsw_t sw_bit, rqsw_t *swp); +static inline uintptr_t runq_sw_is_empty_op(int idx, int sw_idx, + rqsw_t sw_bit, rqsw_t *swp); + +/* Status words' individual bit manipulators. */ +static inline void runq_sw_set_not_empty(struct runq *rq, int idx); +static inline void runq_sw_set_empty(struct runq *rq, int idx); +static inline bool runq_sw_is_empty(struct runq *rq, int idx); + /* * Initialize a run structure. */ @@ -261,98 +289,96 @@ { int i; - bzero(rq, sizeof *rq); + bzero(rq, sizeof(*rq)); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* - * Clear the status bit of the queue corresponding to priority level pri, - * indicating that it is empty. + * Helper to implement functions operating on a particular status word bit. + * + * The operator is passed the initial 'idx', the corresponding status word index + * in 'rq_status' in 'sw_idx', a status word with only that bit set in 'sw_bit' + * and a pointer to the corresponding status word in 'swp'. */ -static __inline void -runq_clrbit(struct runq *rq, int pri) +static inline uintptr_t +runq_sw_apply(struct runq *rq, int idx, runq_sw_op *op) { - struct rqbits *rqb; + rqsw_t *swp; + rqsw_t sw_bit; + int sw_idx; - rqb = &rq->rq_status; - CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", - rqb->rqb_bits[RQB_WORD(pri)], - rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), - RQB_BIT(pri), RQB_WORD(pri)); - rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); + CHECK_IDX(idx); + + sw_idx = RQSW_IDX(idx); + sw_bit = RQSW_BIT(idx); + swp = &rq->rq_status.rq_sw[sw_idx]; + + return (op(idx, sw_idx, sw_bit, swp)); } /* - * Find the index of the first non-empty run queue. This is done by - * scanning the status bits, a set bit indicates a non-empty queue. + * Modify the status words to indicate that some queue is not empty. + * + * Sets the status bit corresponding to the queue at index 'idx'. 
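Illustration (not part of the patch): the runq_sw_* helpers above all reduce to the same word/bit decomposition the old per-arch RQB_* macros performed. A minimal standalone sketch of that pattern, using made-up NQ/BPW constants in place of the kernel's RQ_NQS and RQSW_* macros:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define NQ	64			/* number of queues (stands in for RQ_NQS) */
#define BPW	32			/* bits per status word */
#define NW	((NQ + BPW - 1) / BPW)	/* number of status words */

static uint32_t sw[NW];			/* one "not empty" bit per queue */

static void set_not_empty(int idx) { sw[idx / BPW] |= (uint32_t)1 << (idx % BPW); }
static void set_empty(int idx)     { sw[idx / BPW] &= ~((uint32_t)1 << (idx % BPW)); }
static bool is_empty(int idx)      { return ((sw[idx / BPW] >> (idx % BPW)) & 1) == 0; }

int
main(void)
{
	set_not_empty(33);			/* word 1, bit 1 */
	assert(!is_empty(33) && is_empty(32));
	set_empty(33);
	assert(is_empty(33));
	return (0);
}
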
*/ -static __inline int -runq_findbit(struct runq *rq) +static inline uintptr_t +runq_sw_set_not_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp) { - struct rqbits *rqb; - int pri; - int i; + rqsw_t old_sw __unused = *swp; - rqb = &rq->rq_status; - for (i = 0; i < RQB_LEN; i++) - if (rqb->rqb_bits[i]) { - pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); - CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", - rqb->rqb_bits[i], i, pri); - return (pri); - } - - return (-1); + *swp |= sw_bit; + CTR4(KTR_RUNQ, "runq_sw_set_not_empty: idx=%d sw_idx=%d bits=%#x->%#x", + idx, sw_idx, old_sw, *swp); + return (0); } - -static __inline int -runq_findbit_from(struct runq *rq, u_char pri) +static inline void +runq_sw_set_not_empty(struct runq *rq, int idx) { - struct rqbits *rqb; - rqb_word_t mask; - int i; - - /* - * Set the mask for the first word so we ignore priorities before 'pri'. - */ - mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1)); - rqb = &rq->rq_status; -again: - for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) { - mask = rqb->rqb_bits[i] & mask; - if (mask == 0) - continue; - pri = RQB_FFS(mask) + (i << RQB_L2BPW); - CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d", - mask, i, pri); - return (pri); - } - if (pri == 0) - return (-1); - /* - * Wrap back around to the beginning of the list just once so we - * scan the whole thing. - */ - pri = 0; - goto again; + (void)runq_sw_apply(rq, idx, &runq_sw_set_not_empty_op); } /* - * Set the status bit of the queue corresponding to priority level pri, - * indicating that it is non-empty. + * Modify the status words to indicate that some queue is empty. + * + * Clears the status bit corresponding to the queue at index 'idx'. */ -static __inline void -runq_setbit(struct runq *rq, int pri) +static inline uintptr_t +runq_sw_set_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp) { - struct rqbits *rqb; + rqsw_t old_sw __unused = *swp; - rqb = &rq->rq_status; - CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", - rqb->rqb_bits[RQB_WORD(pri)], - rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), - RQB_BIT(pri), RQB_WORD(pri)); - rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); + *swp &= ~sw_bit; + CTR4(KTR_RUNQ, "runq_sw_set_empty: idx=%d sw_idx=%d bits=%#x->%#x", + idx, sw_idx, old_sw, *swp); + return (0); +} +static inline void +runq_sw_set_empty(struct runq *rq, int idx) +{ + (void)runq_sw_apply(rq, idx, &runq_sw_set_empty_op); +} + +/* + * Returns whether the status words indicate that some queue is empty. + */ +static inline uintptr_t +runq_sw_is_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp) +{ + return ((*swp & sw_bit) == 0); +} +static inline bool +runq_sw_is_empty(struct runq *rq, int idx) +{ + return (runq_sw_apply(rq, idx, &runq_sw_is_empty_op)); +} + +/* + * Returns whether a particular queue is empty. 
+ */ +bool runq_is_queue_empty(struct runq *rq, int idx) +{ + return (runq_sw_is_empty(rq, idx)); } /* @@ -362,102 +388,183 @@ void runq_add(struct runq *rq, struct thread *td, int flags) { - struct rqhead *rqh; - int pri; - pri = td->td_priority / RQ_PPQ; - td->td_rqindex = pri; - runq_setbit(rq, pri); - rqh = &rq->rq_queues[pri]; - CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p", - td, td->td_priority, pri, rqh); - if (flags & SRQ_PREEMPTED) { - TAILQ_INSERT_HEAD(rqh, td, td_runq); - } else { - TAILQ_INSERT_TAIL(rqh, td, td_runq); - } + runq_add_idx(rq, td, RQ_PRI_TO_QUEUE_IDX(td->td_priority), flags); } void -runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags) +runq_add_idx(struct runq *rq, struct thread *td, int idx, int flags) { - struct rqhead *rqh; + struct rq_queue *rqq; - KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri)); - td->td_rqindex = pri; - runq_setbit(rq, pri); - rqh = &rq->rq_queues[pri]; - CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p", - td, td->td_priority, pri, rqh); - if (flags & SRQ_PREEMPTED) { - TAILQ_INSERT_HEAD(rqh, td, td_runq); - } else { - TAILQ_INSERT_TAIL(rqh, td, td_runq); - } + /* + * runq_sw_*() functions assert that 'idx' is non-negative and below + * 'RQ_NQS', and a static assert upper in this file ensures that + * 'RQ_NQS' is no more than 256. + */ + td->td_rqindex = idx; + runq_sw_set_not_empty(rq, idx); + rqq = &rq->rq_queues[idx]; + CTR4(KTR_RUNQ, "runq_add_idx: td=%p pri=%d idx=%d rqq=%p", + td, td->td_priority, idx, rqq); + if (flags & SRQ_PREEMPTED) + TAILQ_INSERT_HEAD(rqq, td, td_runq); + else + TAILQ_INSERT_TAIL(rqq, td, td_runq); } + /* - * Return true if there are runnable processes of any priority on the run - * queue, false otherwise. Has no side effects, does not modify the run - * queue structure. + * Remove the thread from the queue specified by its priority, and clear the + * corresponding status bit if the queue becomes empty. + * + * Returns whether the corresponding queue is empty after removal. + */ +bool +runq_remove(struct runq *rq, struct thread *td) +{ + struct rq_queue *rqq; + int idx; + + KASSERT(td->td_flags & TDF_INMEM, ("runq_remove: Thread swapped out")); + idx = td->td_rqindex; + CHECK_IDX(idx); + rqq = &rq->rq_queues[idx]; + CTR4(KTR_RUNQ, "runq_remove: td=%p pri=%d idx=%d rqq=%p", + td, td->td_priority, idx, rqq); + TAILQ_REMOVE(rqq, td, td_runq); + if (TAILQ_EMPTY(rqq)) { + runq_sw_set_empty(rq, idx); + CTR1(KTR_RUNQ, "runq_remove: queue at idx=%d now empty", idx); + return (true); + } + return (false); +} + +static inline int +runq_findq_status_word(struct runq *const rq, const int w_idx, + const rqsw_t w, runq_pred_t *const pred, void *const pred_data) +{ + struct rq_queue *q; + rqsw_t tw = w; + int idx, b_idx; + + while (tw != 0) { + b_idx = RQSW_BSF(tw); + idx = RQSW_TO_QUEUE_IDX(w_idx, b_idx); + q = &rq->rq_queues[idx]; + KASSERT(!TAILQ_EMPTY(q), + ("runq_findq(): No thread on non-empty queue with idx=%d", + idx)); + if (pred(idx, q, pred_data)) + return (idx); + tw &= ~RQSW_BIT(idx); + } + + return (-1); +} + +/* + * Find in the passed range (bounds included) the index of the first (i.e., + * having lower index) non-empty queue that passes pred(). + * + * Considered queues are those with index 'lvl_min' up to 'lvl_max' (bounds + * included). If no queue matches, returns -1. + * + * This is done by scanning the status words (a set bit indicates a non-empty + * queue) and calling pred() with corresponding queue indices. 
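Illustration (not part of the patch): the scan runq_findq() performs can be pictured in isolation: walk the status words, hand each set bit's queue index (lowest first) to a predicate, and stop at the first acceptance. A self-contained sketch with hypothetical BPW/NW constants and predicate type, mirroring but not reproducing the kernel code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define BPW	32
#define NW	2

typedef bool pred_t(int idx, void *data);

/* Return the lowest queue index whose bit is set and that pred() accepts,
 * or -1 if none. */
static int
find_queue(const uint32_t sw[NW], pred_t *pred, void *data)
{
	for (int w = 0; w < NW; w++) {
		uint32_t tw = sw[w];
		while (tw != 0) {
			int b = ffs((int)tw) - 1;	/* lowest set bit */
			int idx = w * BPW + b;
			if (pred(idx, data))
				return (idx);
			tw &= ~((uint32_t)1 << b);	/* reject: keep scanning */
		}
	}
	return (-1);
}

static bool
accept_ge(int idx, void *data) { return (idx >= *(int *)data); }

int
main(void)
{
	uint32_t sw[NW] = { 1u << 3, 1u << 1 };	/* queues 3 and 33 non-empty */
	int min = 10;
	printf("%d\n", find_queue(sw, accept_ge, &min));	/* prints 33 */
	return (0);
}
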
pred() must + * return whether the corresponding queue is accepted. It is passed private + * data through 'pred_data', which can be used both for extra input and output. */ int -runq_check(struct runq *rq) +runq_findq(struct runq *const rq, const int lvl_min, const int lvl_max, + runq_pred_t *const pred, void *const pred_data) { - struct rqbits *rqb; - int i; + rqsw_t const (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw; + rqsw_t w; + int i, last, idx; - rqb = &rq->rq_status; - for (i = 0; i < RQB_LEN; i++) - if (rqb->rqb_bits[i]) { - CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", - rqb->rqb_bits[i], i); - return (1); - } - CTR0(KTR_RUNQ, "runq_check: empty"); + CHECK_IDX(lvl_min); + CHECK_IDX(lvl_max); + KASSERT(lvl_min <= lvl_max, + ("lvl_min: %d > lvl_max: %d!", lvl_min, lvl_max)); - return (0); + i = RQSW_IDX(lvl_min); + last = RQSW_IDX(lvl_max); + /* Clear bits for runqueues below 'lvl_min'. */ + w = (*rqsw)[i] & ~(RQSW_BIT(lvl_min) - 1); + if (i == last) + goto last_mask; + idx = runq_findq_status_word(rq, i, w, pred, pred_data); + if (idx != -1) + goto return_idx; + + for (++i; i < last; ++i) { + w = (*rqsw)[i]; + idx = runq_findq_status_word(rq, i, w, pred, pred_data); + if (idx != -1) + goto return_idx; + } + + MPASS(i == last); + w = (*rqsw)[i]; +last_mask: + /* Clear bits for runqueues above 'lvl_max'. */ + w &= (RQSW_BIT(lvl_max) - 1) | RQSW_BIT(lvl_max); + idx = runq_findq_status_word(rq, i, w, pred, pred_data); + if (idx != -1) + goto return_idx; + return (-1); +return_idx: + CTR4(KTR_RUNQ, "runq_findq: bits=%#x->%#x i=%d idx=%d", + (*rqsw)[i], w, i, idx); + return (idx); +} + +static bool +runq_first_thread_pred(const int idx, struct rq_queue *const q, void *const data) +{ + struct thread **const tdp = data; + struct thread *const td = TAILQ_FIRST(q); + + *tdp = td; + return (true); +} + +/* Make sure it has an external definition. */ +extern inline struct thread * +runq_first_thread_range(struct runq *const rq, const int lvl_min, + const int lvl_max) +{ + struct thread *td = NULL; + + (void)runq_findq(rq, lvl_min, lvl_max, runq_first_thread_pred, &td); + return (td); +} + +static inline struct thread * +runq_first_thread(struct runq *const rq) +{ + + return (runq_first_thread_range(rq, 0, RQ_NQS - 1)); } /* - * Find the highest priority process on the run queue. + * Return true if there are some processes of any priority on the run queue, + * false otherwise. Has no side effects. */ -struct thread * -runq_choose_fuzz(struct runq *rq, int fuzz) +bool +runq_not_empty(struct runq *rq) { - struct rqhead *rqh; - struct thread *td; - int pri; + struct thread *const td = runq_first_thread(rq); - while ((pri = runq_findbit(rq)) != -1) { - rqh = &rq->rq_queues[pri]; - /* fuzz == 1 is normal.. 0 or less are ignored */ - if (fuzz > 1) { - /* - * In the first couple of entries, check if - * there is one for our CPU as a preference. 
- */ - int count = fuzz; - int cpu = PCPU_GET(cpuid); - struct thread *td2; - td2 = td = TAILQ_FIRST(rqh); - - while (count-- && td2) { - if (td2->td_lastcpu == cpu) { - td = td2; - break; - } - td2 = TAILQ_NEXT(td2, td_runq); - } - } else - td = TAILQ_FIRST(rqh); - KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue")); - CTR3(KTR_RUNQ, - "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh); - return (td); + if (td != NULL) { + CTR2(KTR_RUNQ, "runq_not_empty: idx=%d, td=%p", + td->td_rqindex, td); + return (true); } - CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri); - return (NULL); + CTR0(KTR_RUNQ, "runq_not_empty: empty"); + return (false); } /* @@ -466,73 +573,74 @@ struct thread * runq_choose(struct runq *rq) { - struct rqhead *rqh; struct thread *td; - int pri; - while ((pri = runq_findbit(rq)) != -1) { - rqh = &rq->rq_queues[pri]; - td = TAILQ_FIRST(rqh); - KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); - CTR3(KTR_RUNQ, - "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh); + td = runq_first_thread(rq); + if (td != NULL) { + CTR2(KTR_RUNQ, "runq_choose: idx=%d td=%p", td->td_rqindex, td); return (td); } - CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri); + CTR0(KTR_RUNQ, "runq_choose: idlethread"); return (NULL); } -struct thread * -runq_choose_from(struct runq *rq, u_char idx) +struct runq_fuzz_pred_data { + int fuzz; + struct thread *td; +}; + +static bool +runq_fuzz_pred(const int idx, struct rq_queue *const q, void *const data) { - struct rqhead *rqh; + struct runq_fuzz_pred_data *const d = data; + const int fuzz = d->fuzz; struct thread *td; - int pri; - if ((pri = runq_findbit_from(rq, idx)) != -1) { - rqh = &rq->rq_queues[pri]; - td = TAILQ_FIRST(rqh); - KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); - CTR4(KTR_RUNQ, - "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p", - pri, td, td->td_rqindex, rqh); - return (td); + td = TAILQ_FIRST(q); + + if (fuzz > 1) { + /* + * In the first couple of entries, check if + * there is one for our CPU as a preference. + */ + struct thread *td2 = td; + int count = fuzz; + int cpu = PCPU_GET(cpuid); + + while (count-- != 0 && td2 != NULL) { + if (td2->td_lastcpu == cpu) { + td = td2; + break; + } + td2 = TAILQ_NEXT(td2, td_runq); + } } - CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri); - return (NULL); + d->td = td; + return (true); } + /* - * Remove the thread from the queue specified by its priority, and clear the - * corresponding status bit if the queue becomes empty. - * Caller must set state afterwards. + * Find the highest priority process on the run queue. 
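Illustration (not part of the patch): runq_add() above and the scheduler hunks below funnel the priority-to-queue mapping through RQ_PRI_TO_QUEUE_IDX(), which lives in the unified sys/runq.h and is not shown in this diff. Assuming it keeps the historical pri / RQ_PPQ mapping with RQ_PPQ == 4, as the old 4BSD code and the deleted spa.c comment suggest, the arithmetic is:

#include <stdio.h>

#define RQ_PPQ	4	/* priorities per queue (historical value; assumption) */
#define RQ_PRI_TO_QUEUE_IDX(pri) ((pri) / RQ_PPQ)	/* sketch of the macro */

int
main(void)
{
	/* 256 priority levels collapse onto 64 queues of 4 adjacent levels,
	 * so priorities differing by less than RQ_PPQ can share a queue. */
	printf("%d %d %d\n",
	    RQ_PRI_TO_QUEUE_IDX(0),	/* 0 */
	    RQ_PRI_TO_QUEUE_IDX(119),	/* 29 */
	    RQ_PRI_TO_QUEUE_IDX(255));	/* 63 */
	return (0);
}
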
*/ -void -runq_remove(struct runq *rq, struct thread *td) +struct thread * +runq_choose_fuzz(struct runq *rq, int fuzz) { + struct runq_fuzz_pred_data data = { + .fuzz = fuzz, + .td = NULL + }; + int idx; - runq_remove_idx(rq, td, NULL); -} - -void -runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx) -{ - struct rqhead *rqh; - u_char pri; - - KASSERT(td->td_flags & TDF_INMEM, - ("runq_remove_idx: thread swapped out")); - pri = td->td_rqindex; - KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); - rqh = &rq->rq_queues[pri]; - CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p", - td, td->td_priority, pri, rqh); - TAILQ_REMOVE(rqh, td, td_runq); - if (TAILQ_EMPTY(rqh)) { - CTR0(KTR_RUNQ, "runq_remove_idx: empty"); - runq_clrbit(rq, pri); - if (idx != NULL && *idx == pri) - *idx = (pri + 1) % RQ_NQS; + idx = runq_findq(rq, 0, RQ_NQS - 1, runq_fuzz_pred, &data); + if (idx != -1) { + MPASS(data.td != NULL); + CTR2(KTR_RUNQ, "runq_choose_fuzz: idx=%d td=%p", idx, data.td); + return (data.td); } + + MPASS(data.td == NULL); + CTR0(KTR_RUNQ, "runq_choose_fuzz: idlethread"); + return (NULL); } diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -72,15 +73,17 @@ * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ -#define ESTCPULIM(e) \ - min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ - RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ +#define ESTCPULIM(e) \ + min((e), INVERSE_ESTCPU_WEIGHT * \ + (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) + \ + PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) \ + + INVERSE_ESTCPU_WEIGHT - 1) #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) @@ -683,13 +686,14 @@ /* Nothing needed. */ } -int +bool sched_runnable(void) { #ifdef SMP - return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); + return (runq_not_empty(&runq) || + runq_not_empty(&runq_pcpu[PCPU_GET(cpuid)])); #else - return runq_check(&runq); + return (runq_not_empty(&runq)); #endif } @@ -871,7 +875,7 @@ if (td->td_priority == prio) return; td->td_priority = prio; - if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { + if (TD_ON_RUNQ(td) && td->td_rqindex != RQ_PRI_TO_QUEUE_IDX(prio)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); } @@ -1682,7 +1686,7 @@ for (;;) { mtx_assert(&Giant, MA_NOTOWNED); - while (sched_runnable() == 0) { + while (!sched_runnable()) { cpu_idle(stat->idlecalls + stat->oldidlecalls > 64); stat->idlecalls++; } diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -87,10 +88,9 @@ * Thread scheduler specific section. All fields are protected * by the thread lock. */ -struct td_sched { - struct runq *ts_runq; /* Run-queue we're queued on. */ +struct td_sched { short ts_flags; /* TSF_* flags. */ - int ts_cpu; /* CPU that we have affinity for. */ + int ts_cpu; /* CPU we are on, or were last on. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. 
slept */ @@ -130,23 +130,6 @@ #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE -/* - * Cpu percentage computation macros and defines. - * - * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. - * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. - * SCHED_TICK_MAX: Maximum number of ticks before scaling back. - * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. - * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. - * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. - */ -#define SCHED_TICK_SECS 10 -#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) -#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) -#define SCHED_TICK_SHIFT 10 -#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) -#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) - /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed @@ -169,6 +152,48 @@ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) +/* + * Runqueue indices for the implemented scheduling policies' priority bounds. + * + * In ULE's implementation, realtime policy covers the ITHD, REALTIME and + * INTERACT (see above) ranges, timesharing the BATCH range (see above), and + * idle policy the IDLE range. + * + * Priorities from these ranges must not be assigned to the same runqueue's + * queue. + */ +#define RQ_RT_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_ITHD)) +#define RQ_RT_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_INTERACT)) +#define RQ_TS_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_BATCH)) +#define RQ_TS_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_BATCH)) +#define RQ_ID_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_IDLE)) +#define RQ_ID_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_IDLE)) + +_Static_assert(RQ_RT_POL_MAX != RQ_TS_POL_MIN, + "ULE's realtime and timeshare policies' runqueue ranges overlap"); +_Static_assert(RQ_TS_POL_MAX != RQ_ID_POL_MIN, + "ULE's timeshare and idle policies' runqueue ranges overlap"); + +/* Helper to treat the timeshare range as a circular group of queues. */ +#define RQ_TS_POL_MODULO (RQ_TS_POL_MAX - RQ_TS_POL_MIN + 1) + +/* + * Cpu percentage computation macros and defines. + * + * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. + * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. + * SCHED_TICK_MAX: Maximum number of ticks before scaling back. + * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. + * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. + * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. + */ +#define SCHED_TICK_SECS 10 +#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) +#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) +#define SCHED_TICK_SHIFT 10 +#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) +#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) + /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran @@ -252,12 +277,10 @@ short tdq_oldswitchcnt; /* (l) Switches last tick. */ u_char tdq_lowpri; /* (ts) Lowest priority thread. */ u_char tdq_owepreempt; /* (f) Remote preemption pending. */ - u_char tdq_idx; /* (t) Current insert index. */ - u_char tdq_ridx; /* (t) Current removal index. 
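Illustration (not part of the patch): the tdq_ts_off/tdq_ts_deq_off pair added just below implements the same rotating calendar-queue trick as the tdq_idx/tdq_ridx fields it replaces, but as offsets into the batch sub-range of the single runqueue. A toy model of the rotation, with made-up MIN/MODULO values standing in for RQ_TS_POL_MIN and RQ_TS_POL_MODULO:

#include <assert.h>

#define TS_MIN		20	/* first batch queue (stand-in for RQ_TS_POL_MIN) */
#define TS_MODULO	40	/* number of batch queues (RQ_TS_POL_MODULO) */

/* Map a batch-relative slot to an absolute queue index, rotated by 'off'. */
static int
ts_queue(int rel, int off)
{
	assert(0 <= rel && rel < TS_MODULO);
	return (TS_MIN + (rel + off) % TS_MODULO);
}

int
main(void)
{
	/* With no rotation, slot 0 is the first batch queue... */
	assert(ts_queue(0, 0) == TS_MIN);
	/* ...after 39 single-step advances it sits in the last one... */
	assert(ts_queue(0, 39) == TS_MIN + 39);
	/* ...and the next slot has wrapped back to the start. */
	assert(ts_queue(1, 39) == TS_MIN);
	return (0);
}
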
*/ + u_char tdq_ts_off; /* (t) TS insertion offset. */ + u_char tdq_ts_deq_off; /* (t) TS dequeue offset. */ int tdq_id; /* (c) cpuid. */ - struct runq tdq_realtime; /* (t) real-time run queue. */ - struct runq tdq_timeshare; /* (t) timeshare run queue. */ - struct runq tdq_idle; /* (t) Queue of IDLE threads. */ + struct runq tdq_runq; /* (t) Run queue. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; @@ -329,12 +352,17 @@ static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ +static inline struct thread *runq_choose_realtime(struct runq *const rq); +static inline struct thread *runq_choose_timeshare(struct runq *const rq, + int off); +static inline struct thread *runq_choose_idle(struct runq *const rq); static struct thread *tdq_choose(struct tdq *); + static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); -static __inline void tdq_runq_add(struct tdq *, struct thread *, int); -static __inline void tdq_runq_rem(struct tdq *, struct thread *); +static inline void tdq_runq_add(struct tdq *, struct thread *, int); +static inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); static void tdq_print(int cpu); static void runq_print(struct runq *rq); @@ -343,8 +371,19 @@ static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, int lowpri); + +static bool runq_steal_pred(const int idx, struct rq_queue *const q, + void *const data); +static inline struct thread *runq_steal_range(struct runq *const rq, + const int lvl_min, const int lvl_max, int cpu); +static inline struct thread *runq_steal_realtime(struct runq *const rq, + int cpu); +static inline struct thread *runq_steal_timeshare(struct runq *const rq, + int cpu, int off); +static inline struct thread *runq_steal_idle(struct runq *const rq, + int cpu); static struct thread *tdq_steal(struct tdq *, int); -static struct thread *runq_steal(struct runq *, int); + static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static bool sched_balance_pair(struct tdq *, struct tdq *); @@ -386,20 +425,20 @@ static void runq_print(struct runq *rq) { - struct rqhead *rqh; + struct rq_queue *rqq; struct thread *td; int pri; int j; int i; - for (i = 0; i < RQB_LEN; i++) { + for (i = 0; i < RQSW_NB; i++) { printf("\t\trunq bits %d 0x%zx\n", - i, rq->rq_status.rqb_bits[i]); - for (j = 0; j < RQB_BPW; j++) - if (rq->rq_status.rqb_bits[i] & (1ul << j)) { - pri = j + (i << RQB_L2BPW); - rqh = &rq->rq_queues[pri]; - TAILQ_FOREACH(td, rqh, td_runq) { + i, rq->rq_status.rq_sw[i]); + for (j = 0; j < RQSW_BPW; j++) + if (rq->rq_status.rq_sw[i] & (1ul << j)) { + pri = RQSW_TO_QUEUE_IDX(i, j); + rqq = &rq->rq_queues[pri]; + TAILQ_FOREACH(td, rqq, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); @@ -419,21 +458,17 @@ tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); - printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); - printf("\tLock name: %s\n", tdq->tdq_name); - printf("\tload: %d\n", tdq->tdq_load); - printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); - printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); - printf("\ttimeshare idx: %d\n", tdq->tdq_idx); - printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); + printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); + printf("\tLock name: %s\n", 
tdq->tdq_name); + printf("\tload: %d\n", tdq->tdq_load); + printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); + printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); + printf("\tTS insert offset: %d\n", tdq->tdq_ts_off); + printf("\tTS dequeue offset: %d\n", tdq->tdq_ts_deq_off); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); - printf("\trealtime runq:\n"); - runq_print(&tdq->tdq_realtime); - printf("\ttimeshare runq:\n"); - runq_print(&tdq->tdq_timeshare); - printf("\tidle runq:\n"); - runq_print(&tdq->tdq_idle); + printf("\trunq:\n"); + runq_print(&tdq->tdq_runq); } static inline int @@ -474,11 +509,11 @@ * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ -static __inline void +static inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; - u_char pri; + u_char pri, idx; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); @@ -490,62 +525,68 @@ tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } - if (pri < PRI_MIN_BATCH) { - ts->ts_runq = &tdq->tdq_realtime; - } else if (pri <= PRI_MAX_BATCH) { - ts->ts_runq = &tdq->tdq_timeshare; - KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, - ("Invalid priority %d on timeshare runq", pri)); + if (PRI_MIN_BATCH <= pri && pri <= PRI_MAX_BATCH) { /* - * This queue contains only priorities between MIN and MAX - * batch. Use the whole queue to represent these values. + * The queues allocated to the batch range are not used as + * a simple array but as a "circular" one where the insertion + * index (derived from 'pri') is offset by 'tdq_ts_off'. 'idx' + * is first set to the offset of the wanted queue in the TS' + * selection policy range. */ - if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { - pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; - pri = (pri + tdq->tdq_idx) % RQ_NQS; + if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) != 0) + /* Current queue from which processes are being run. */ + idx = tdq->tdq_ts_deq_off; + else { + idx = (RQ_PRI_TO_QUEUE_IDX(pri) - RQ_TS_POL_MIN + + tdq->tdq_ts_off) % RQ_TS_POL_MODULO; /* - * This effectively shortens the queue by one so we - * can have a one slot difference between idx and - * ridx while we wait for threads to drain. + * We avoid enqueuing low priority threads in the queue + * that we are still draining, effectively shortening + * the runqueue by one queue. */ - if (tdq->tdq_ridx != tdq->tdq_idx && - pri == tdq->tdq_ridx) - pri = (unsigned char)(pri - 1) % RQ_NQS; - } else - pri = tdq->tdq_ridx; - runq_add_pri(ts->ts_runq, td, pri, flags); - return; + if (tdq->tdq_ts_deq_off != tdq->tdq_ts_off && + idx == tdq->tdq_ts_deq_off) + /* Ensure the dividend is positive. */ + idx = (idx - 1 + RQ_TS_POL_MODULO) % + RQ_TS_POL_MODULO; + } + /* Absolute queue index. */ + idx += RQ_TS_POL_MIN; + runq_add_idx(&tdq->tdq_runq, td, idx, flags); } else - ts->ts_runq = &tdq->tdq_idle; - runq_add(ts->ts_runq, td, flags); + runq_add(&tdq->tdq_runq, td, flags); } -/* +/* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. 
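Illustration (not part of the patch): a worked example of the insertion path in tdq_runq_add() just above, under assumed values RQ_TS_POL_MIN = 20, RQ_TS_POL_MODULO = 40, tdq_ts_off = 5, tdq_ts_deq_off = 3, and a thread whose RQ_PRI_TO_QUEUE_IDX(pri) is 58:

#include <assert.h>

int
main(void)
{
	const int TS_MIN = 20, MODULO = 40;	/* assumed range */
	int ts_off = 5, ts_deq_off = 3;		/* assumed offsets */
	int pri_idx = 58;			/* RQ_PRI_TO_QUEUE_IDX(pri) */

	int idx = (pri_idx - TS_MIN + ts_off) % MODULO;	/* (58-20+5) % 40 */
	assert(idx == 3);
	/* Collides with the queue still being drained: step back one slot. */
	if (ts_deq_off != ts_off && idx == ts_deq_off)
		idx = (idx - 1 + MODULO) % MODULO;
	assert(idx == 2);
	assert(TS_MIN + idx == 22);		/* absolute queue index used */
	return (0);
}
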
*/ -static __inline void +static inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; + bool queue_empty; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); - KASSERT(ts->ts_runq != NULL, - ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } - if (ts->ts_runq == &tdq->tdq_timeshare) { - if (tdq->tdq_idx != tdq->tdq_ridx) - runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); - else - runq_remove_idx(ts->ts_runq, td, NULL); - } else - runq_remove(ts->ts_runq, td); + queue_empty = runq_remove(&tdq->tdq_runq, td); + /* + * If thread has a batch priority and the queue from which it was + * removed is now empty, advance the batch's queue removal index if it + * lags with respect to the batch's queue insertion index. + */ + if (queue_empty && PRI_MIN_BATCH <= td->td_priority && + td->td_priority <= PRI_MAX_BATCH && + tdq->tdq_ts_off != tdq->tdq_ts_deq_off && + tdq->tdq_ts_deq_off == td->td_rqindex) + tdq->tdq_ts_deq_off = (tdq->tdq_ts_deq_off + 1) % + RQ_TS_POL_MODULO; } /* @@ -1178,82 +1219,84 @@ ipi_cpu(cpu, IPI_PREEMPT); } +struct runq_steal_pred_data { + struct thread *td; + int cpu; +}; + +static bool +runq_steal_pred(const int idx, struct rq_queue *const q, void *const data) +{ + struct runq_steal_pred_data *const d = data; + struct thread *td; + + TAILQ_FOREACH(td, q, td_runq) { + if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, d->cpu)) { + d->td = td; + return (true); + } + } + + return (false); +} + +/* + * Steals load contained in queues with indices in the specified range. + */ +static inline struct thread * +runq_steal_range(struct runq *const rq, const int lvl_min, const int lvl_max, + int cpu) +{ + struct runq_steal_pred_data data = { + .td = NULL, + .cpu = cpu, + }; + int idx; + + idx = runq_findq(rq, lvl_min, lvl_max, &runq_steal_pred, &data); + if (idx != -1) { + MPASS(data.td != NULL); + return (data.td); + } + + MPASS(data.td == NULL); + return (NULL); +} + +static inline struct thread * +runq_steal_realtime(struct runq *const rq, int cpu) +{ + + return (runq_steal_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX, cpu)); +} + /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ -static struct thread * -runq_steal_from(struct runq *rq, int cpu, u_char start) +static inline struct thread * +runq_steal_timeshare(struct runq *const rq, int cpu, int off) { - struct rqbits *rqb; - struct rqhead *rqh; - struct thread *td, *first; - int bit; - int i; - - rqb = &rq->rq_status; - bit = start & (RQB_BPW -1); - first = NULL; -again: - for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { - if (rqb->rqb_bits[i] == 0) - continue; - if (bit == 0) - bit = RQB_FFS(rqb->rqb_bits[i]); - for (; bit < RQB_BPW; bit++) { - if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) - continue; - rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; - TAILQ_FOREACH(td, rqh, td_runq) { - if (first) { - if (THREAD_CAN_MIGRATE(td) && - THREAD_CAN_SCHED(td, cpu)) - return (td); - } else - first = td; - } - } - } - if (start != 0) { - start = 0; - goto again; - } - - if (first && THREAD_CAN_MIGRATE(first) && - THREAD_CAN_SCHED(first, cpu)) - return (first); - return (NULL); -} - -/* - * Steals load from a standard linear queue. 
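Illustration (not part of the patch): runq_steal_timeshare() and runq_choose_timeshare() below both replace the old wrap-around scan with two straight range searches. A sketch of that decomposition, with assumed range bounds and a stand-in for the range search:

#include <stdio.h>

#define MIN	20
#define MAX	59	/* assumed batch range bounds */

/* Stand-in for a range search: just report what would be scanned. */
static void
search(int lo, int hi)
{
	printf("scan [%d, %d]\n", lo, hi);
}

/* Rotated search starting at MIN + off, wrapping once, as the new code
 * does with two runq_steal_range()/runq_first_thread_range() calls. */
static void
rotated_search(int off)
{
	search(MIN + off, MAX);		/* first, from the rotation point up */
	if (off != 0)
		search(MIN, MIN + off - 1);	/* then the wrapped-around tail */
}

int
main(void)
{
	rotated_search(0);	/* scan [20, 59] */
	rotated_search(7);	/* scan [27, 59], then [20, 26] */
	return (0);
}
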
- */ -static struct thread * -runq_steal(struct runq *rq, int cpu) -{ - struct rqhead *rqh; - struct rqbits *rqb; struct thread *td; - int word; - int bit; - rqb = &rq->rq_status; - for (word = 0; word < RQB_LEN; word++) { - if (rqb->rqb_bits[word] == 0) - continue; - for (bit = 0; bit < RQB_BPW; bit++) { - if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) - continue; - rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; - TAILQ_FOREACH(td, rqh, td_runq) - if (THREAD_CAN_MIGRATE(td) && - THREAD_CAN_SCHED(td, cpu)) - return (td); - } - } - return (NULL); + MPASS(0 <= off && off < RQ_TS_POL_MODULO); + + td = runq_steal_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX, cpu); + if (td != NULL || off == 0) + return (td); + + td = runq_steal_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1, cpu); + return (td); } +static inline struct thread * +runq_steal_idle(struct runq *const rq, int cpu) +{ + + return (runq_steal_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX, cpu)); +} + + /* * Attempt to steal a thread in priority order from a thread queue. */ @@ -1263,12 +1306,13 @@ struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); - if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) + td = runq_steal_realtime(&tdq->tdq_runq, cpu); + if (td != NULL) return (td); - if ((td = runq_steal_from(&tdq->tdq_timeshare, - cpu, tdq->tdq_ridx)) != NULL) + td = runq_steal_timeshare(&tdq->tdq_runq, cpu, tdq->tdq_ts_deq_off); + if (td != NULL) return (td); - return (runq_steal(&tdq->tdq_idle, cpu)); + return (runq_steal_idle(&tdq->tdq_runq, cpu)); } /* @@ -1450,6 +1494,35 @@ } #endif +static inline struct thread * +runq_choose_realtime(struct runq *const rq) +{ + + return (runq_first_thread_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX)); +} + +static struct thread * +runq_choose_timeshare(struct runq *const rq, int off) +{ + struct thread *td; + + MPASS(0 <= off && off < RQ_TS_POL_MODULO); + + td = runq_first_thread_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX); + if (td != NULL || off == 0) + return (td); + + td = runq_first_thread_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1); + return (td); +} + +static inline struct thread * +runq_choose_idle(struct runq *const rq) +{ + + return (runq_first_thread_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX)); +} + /* * Pick the highest priority task we have and return it. */ @@ -1459,17 +1532,17 @@ struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); - td = runq_choose(&tdq->tdq_realtime); + td = runq_choose_realtime(&tdq->tdq_runq); if (td != NULL) return (td); - td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); + td = runq_choose_timeshare(&tdq->tdq_runq, tdq->tdq_ts_deq_off); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } - td = runq_choose(&tdq->tdq_idle); + td = runq_choose_idle(&tdq->tdq_runq); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", @@ -1489,9 +1562,7 @@ if (bootverbose) printf("ULE: setup cpu %d\n", id); - runq_init(&tdq->tdq_realtime); - runq_init(&tdq->tdq_timeshare); - runq_init(&tdq->tdq_idle); + runq_init(&tdq->tdq_runq); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); @@ -2595,13 +2666,14 @@ tdq->tdq_switchcnt = tdq->tdq_load; /* - * Advance the insert index once for each tick to ensure that all + * Advance the insert offset once for each tick to ensure that all * threads get a chance to run. 
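Illustration (not part of the patch): a toy trace of the tick logic in the hunk that follows, showing the insert offset running at most one step ahead of the dequeue offset while a non-empty queue drains (MODULO stands in for RQ_TS_POL_MODULO):

#include <assert.h>
#include <stdbool.h>

#define MODULO	40	/* assumed RQ_TS_POL_MODULO */

static int ts_off, ts_deq_off;

/* One clock tick: advance the insert offset only when it has been caught
 * up with, and drag the dequeue offset along once its queue is empty. */
static void
tick(bool deq_queue_empty)
{
	if (ts_off == ts_deq_off) {
		ts_off = (ts_off + 1) % MODULO;
		if (deq_queue_empty)
			ts_deq_off = ts_off;
	}
}

int
main(void)
{
	tick(true);			/* empty: both advance together */
	assert(ts_off == 1 && ts_deq_off == 1);
	tick(false);			/* non-empty: insert offset runs ahead */
	assert(ts_off == 2 && ts_deq_off == 1);
	tick(false);			/* ...but never by more than one step */
	assert(ts_off == 2 && ts_deq_off == 1);
	return (0);
}
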
*/ - if (tdq->tdq_idx == tdq->tdq_ridx) { - tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; - if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) - tdq->tdq_ridx = tdq->tdq_idx; + if (tdq->tdq_ts_off == tdq->tdq_ts_deq_off) { + tdq->tdq_ts_off = (tdq->tdq_ts_off + 1) % RQ_TS_POL_MODULO; + if (runq_is_queue_empty(&tdq->tdq_runq, + tdq->tdq_ts_deq_off + RQ_TS_POL_MIN)) + tdq->tdq_ts_deq_off = tdq->tdq_ts_off; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); @@ -2655,24 +2727,20 @@ * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ -int +bool sched_runnable(void) { struct tdq *tdq; - int load; - - load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (TDQ_LOAD(tdq) > 0) - goto out; + return (true); } else if (TDQ_LOAD(tdq) - 1 > 0) - goto out; - load = 0; -out: - return (load); + return (true); + + return (false); } /* diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -47,7 +47,7 @@ #include #include -#define LOG_RDPRI (PZERO + 1) +#define LOG_RDPRI PZERO #define LOG_ASYNC 0x04 diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -894,7 +894,7 @@ we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); - error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, + error = msleep(msqkptr, &msq_mtx, PVFS | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) @@ -1303,7 +1303,7 @@ */ DPRINTF(("msgrcv: goodnight\n")); - error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, + error = msleep(msqkptr, &msq_mtx, PVFS | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -1309,7 +1309,7 @@ semptr->semncnt++; DPRINTF(("semop: good night!\n")); - error = msleep_sbt(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, + error = msleep_sbt(semakptr, sema_mtxp, PVFS | PCATCH, "semwait", sbt, precision, C_ABSOLUTE); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -756,7 +756,7 @@ break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), - (PRIBIO + 4) | slpflag, "newbuf", slptimeo); + PVFS | slpflag, "newbuf", slptimeo); if (error != 0) break; } @@ -2654,8 +2654,7 @@ mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; - msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), - "flswai", 0); + msleep(&bdirtywait, &bdirtylock, PVFS, "flswai", 0); } mtx_unlock(&bdirtylock); } @@ -5234,7 +5233,7 @@ while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), - slpflag | (PRIBIO + 1), "bo_wwait", timeo); + slpflag | PRIBIO, "bo_wwait", timeo); if (error) break; } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -787,7 +787,7 @@ } DROP_GIANT(); sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); - sleepq_wait(&fp->f_vnread_flags, PUSER -1); + sleepq_wait(&fp->f_vnread_flags, PRI_MAX_KERN); PICKUP_GIANT(); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); @@ -849,7 +849,7 @@ if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + 
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -787,7 +787,7 @@
 		}
 		DROP_GIANT();
 		sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
-		sleepq_wait(&fp->f_vnread_flags, PUSER -1);
+		sleepq_wait(&fp->f_vnread_flags, PRI_MAX_KERN);
 		PICKUP_GIANT();
 		sleepq_lock(&fp->f_vnread_flags);
 		state = atomic_load_16(flagsp);
@@ -849,7 +849,7 @@
 	if ((flags & FOF_NOLOCK) == 0) {
 		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+			msleep(&fp->f_vnread_flags, mtxp, PRI_MAX_KERN,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
@@ -1897,7 +1897,7 @@
 		if (flags & V_PCATCH)
 			mflags |= PCATCH;
 	}
-	mflags |= (PUSER - 1);
+	mflags |= PRI_MAX_KERN;
 	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		if ((flags & V_NOWAIT) != 0) {
 			error = EWOULDBLOCK;
@@ -2022,7 +2022,7 @@
 		if ((flags & V_PCATCH) != 0)
 			mflags |= PCATCH;
 	}
-	mflags |= (PUSER - 1) | PDROP;
+	mflags |= PRI_MAX_KERN | PDROP;
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0);
 	vfs_rel(mp);
 	if (error == 0)
@@ -2107,7 +2107,7 @@
 		return (EALREADY);
 	}
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
-		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
+		msleep(&mp->mnt_flag, MNT_MTX(mp), PRI_MAX_KERN, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
@@ -2128,7 +2128,7 @@
 	mp->mnt_susp_owner = curthread;
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount,
-		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
+		    MNT_MTX(mp), PRI_MAX_KERN | PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@@ -834,7 +834,7 @@
 		tp->tun_flags &= ~TUN_RWAIT;
 		wakeup(tp);
 	}
-	selwakeuppri(&tp->tun_rsel, PZERO + 1);
+	selwakeuppri(&tp->tun_rsel, PZERO);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
 		TUN_UNLOCK(tp);
@@ -1172,7 +1172,7 @@
 	CURVNET_RESTORE();
 
 	funsetown(&tp->tun_sigio);
-	selwakeuppri(&tp->tun_rsel, PZERO + 1);
+	selwakeuppri(&tp->tun_rsel, PZERO);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	TUNDEBUG (ifp, "closed\n");
 	tp->tun_flags &= ~TUN_OPEN;
@@ -1706,7 +1706,7 @@
 			return (EWOULDBLOCK);
 		}
 		tp->tun_flags |= TUN_RWAIT;
-		error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
+		error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | PZERO,
 		    "tunread", 0);
 		if (error != 0) {
 			TUN_UNLOCK(tp);
diff --git a/sys/netgraph/ng_device.c b/sys/netgraph/ng_device.c
--- a/sys/netgraph/ng_device.c
+++ b/sys/netgraph/ng_device.c
@@ -462,7 +462,7 @@
 		mtx_lock(&priv->ngd_mtx);
 		priv->flags |= NGDF_RWAIT;
 		if ((error = msleep(priv, &priv->ngd_mtx,
-		    PDROP | PCATCH | (PZERO + 1),
+		    PDROP | PCATCH | PZERO,
 		    "ngdread", 0)) != 0)
 			return (error);
 	}
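The tun(4) and ng_device(4) hunks touch both sides of the same pattern: a reader sleeps on the softc at a kernel priority, and the data-ready path wakes plain sleepers and select(2) waiters alike. The following is a schematic, kernel-style sketch of that pattern (not a buildable module); "softc", "sc_mtx", "sc_rsel" and "sc_flags" are hypothetical names:

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/selinfo.h>
    #include <sys/systm.h>

    struct softc {
            struct mtx      sc_mtx;
            struct selinfo  sc_rsel;
            int             sc_flags;
    #define SC_RWAIT        0x1
    };

    /* Reader side: block until data arrives, interruptibly. */
    static int
    sc_wait_for_data(struct softc *sc)
    {
            int error;

            mtx_lock(&sc->sc_mtx);
            sc->sc_flags |= SC_RWAIT;
            /* Sleep at PZERO; PCATCH lets signals abort the wait. */
            error = mtx_sleep(sc, &sc->sc_mtx, PCATCH | PZERO, "scread", 0);
            mtx_unlock(&sc->sc_mtx);
            return (error);
    }

    /* Writer side: wake sleepers and select()/poll() waiters alike. */
    static void
    sc_data_ready(struct softc *sc)
    {

            mtx_lock(&sc->sc_mtx);
            if (sc->sc_flags & SC_RWAIT) {
                    sc->sc_flags &= ~SC_RWAIT;
                    wakeup(sc);
            }
            selwakeuppri(&sc->sc_rsel, PZERO);
            mtx_unlock(&sc->sc_mtx);
    }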
diff --git a/sys/powerpc/include/runq.h b/sys/powerpc/include/runq.h
deleted file mode 100644
--- a/sys/powerpc/include/runq.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2001 Jake Burkholder
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#ifdef __powerpc64__
-#define RQB_LEN (1UL)	/* Number of priority status words. */
-#define RQB_L2BPW (6UL)	/* Log2(sizeof(rqb_word_t) * NBBY)). */
-#else
-#define RQB_LEN (2)	/* Number of priority status words. */
-#define RQB_L2BPW (5)	/* Log2(sizeof(rqb_word_t) * NBBY)). */
-#endif
-#define RQB_BPW (1UL<<RQB_L2BPW)	/* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1UL << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffsl(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-#ifdef __powerpc64__
-typedef u_int64_t rqb_word_t;
-#else
-typedef u_int32_t rqb_word_t;
-#endif
-
-#endif
diff --git a/sys/riscv/include/runq.h b/sys/riscv/include/runq.h
deleted file mode 100644
--- a/sys/riscv/include/runq.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*-
- * Copyright (c) 2001 Jake Burkholder
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (1)	/* Number of priority status words. */
-#define RQB_L2BPW (6)	/* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW)	/* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1ul << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffsl(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-typedef unsigned long rqb_word_t;
-
-#endif
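Each deleted machine header described the same bitmap in terms of the machine word: RQB_WORD() picks the status word holding a queue's bit, RQB_BIT() the bit within it, and RQB_FFS() locates the lowest set bit, i.e. the highest-priority non-empty queue. A runnable userland sketch of that mapping, assuming 64-bit status words as in the deleted riscv header:

    #include <stdio.h>
    #include <strings.h>    /* ffsl() */

    #define RQB_L2BPW       6                       /* log2(bits per status word) */
    #define RQB_BPW         (1UL << RQB_L2BPW)      /* 64 bits per word */
    #define RQB_WORD(idx)   ((idx) >> RQB_L2BPW)    /* which status word */
    #define RQB_BIT(idx)    (1UL << ((idx) & (RQB_BPW - 1))) /* which bit in it */
    #define RQB_FFS(word)   (ffsl((long)(word)) - 1) /* lowest set bit index */

    int
    main(void)
    {
            unsigned long rqb_bits[1] = { 0 };
            int q = 37, found;      /* pretend queue 37 has a runnable thread */

            rqb_bits[RQB_WORD(q)] |= RQB_BIT(q);
            found = (RQB_WORD(q) << RQB_L2BPW) + RQB_FFS(rqb_bits[RQB_WORD(q)]);
            printf("lowest non-empty queue: %d\n", found);
            return (0);
    }

The duplication across architectures existed only because the word size and find-first-set primitive differ; the new sys/runq.h below derives both from _LP64/_ILP32 instead.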
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -296,7 +296,7 @@
  * Initialize a lock.
  */
 #define BUF_LOCKINIT(bp, wmesg)						\
-	lockinit(&(bp)->b_lock, PRIBIO + 4, wmesg, 0, LK_NEW)
+	lockinit(&(bp)->b_lock, PVFS, wmesg, 0, LK_NEW)
 /*
  *
  * Get a lock sleeping non-interruptably until it becomes available.
@@ -311,7 +311,7 @@
  */
 #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo)	\
 	_lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK,	\
-	    (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo),	\
+	    (interlock), (wmesg), PVFS | (catch), (timo),		\
 	    LOCK_FILE, LOCK_LINE)
 
 /*
diff --git a/sys/sys/param.h b/sys/sys/param.h
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -73,7 +73,7 @@
  * cannot include sys/param.h and should only be updated here.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1500018
+#define __FreeBSD_version 1500019
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/priority.h b/sys/sys/priority.h
--- a/sys/sys/priority.h
+++ b/sys/sys/priority.h
@@ -64,17 +64,23 @@
  */
 
 /*
- * Priorities range from 0 to 255, but differences of less then 4 (RQ_PPQ)
- * are insignificant.  Ranges are as follows:
+ * Priorities range from 0 to 255.  Ranges are as follows:
  *
- * Interrupt threads:		0 - 15
- * Realtime user threads:	16 - 47
- * Top half kernel threads:	48 - 87
- * Time sharing user threads:	88 - 223
+ * Interrupt threads:		0 - 7
+ * Realtime user threads:	8 - 39
+ * Top half kernel threads:	40 - 55
+ * Time sharing user threads:	56 - 223
  * Idle user threads:		224 - 255
  *
- * XXX If/When the specific interrupt thread and top half thread ranges
- * disappear, a larger range can be used for user processes.
+ * Priority levels of rtprio(2)'s RTP_PRIO_FIFO and RTP_PRIO_REALTIME and
+ * POSIX's SCHED_FIFO and SCHED_RR are directly mapped to the internal realtime
+ * range mentioned above by a simple translation.  This range's length
+ * consequently cannot be changed without impacts on the scheduling priority
+ * code, and in any case must never be smaller than 32 for POSIX compliance and
+ * rtprio(2) backwards compatibility.  Similarly, priority levels of rtprio(2)'s
+ * RTP_PRIO_IDLE are directly mapped to the internal idle range above (and,
+ * soon, those of the to-be-introduced SCHED_IDLE policy as well), so changing
+ * that range is subject to the same caveats and restrictions.
  */
 
 #define PRI_MIN			(0)		/* Highest priority. */
@@ -88,34 +94,34 @@
  * decay to lower priorities if they run for full time slices.
  */
 #define PI_REALTIME		(PRI_MIN_ITHD + 0)
-#define PI_INTR			(PRI_MIN_ITHD + 4)
+#define PI_INTR			(PRI_MIN_ITHD + 1)
 #define PI_AV			PI_INTR
 #define PI_NET			PI_INTR
 #define PI_DISK			PI_INTR
 #define PI_TTY			PI_INTR
 #define PI_DULL			PI_INTR
-#define PI_SOFT			(PRI_MIN_ITHD + 8)
+#define PI_SOFT			(PRI_MIN_ITHD + 2)
 #define PI_SOFTCLOCK		PI_SOFT
 #define PI_SWI(x)		PI_SOFT
 
-#define PRI_MIN_REALTIME	(16)
+#define PRI_MIN_REALTIME	(8)
 #define PRI_MAX_REALTIME	(PRI_MIN_KERN - 1)
 
-#define PRI_MIN_KERN		(48)
+#define PRI_MIN_KERN		(40)
 #define PRI_MAX_KERN		(PRI_MIN_TIMESHARE - 1)
 
 #define PSWP			(PRI_MIN_KERN + 0)
-#define PVM			(PRI_MIN_KERN + 4)
-#define PINOD			(PRI_MIN_KERN + 8)
-#define PRIBIO			(PRI_MIN_KERN + 12)
-#define PVFS			(PRI_MIN_KERN + 16)
-#define PZERO			(PRI_MIN_KERN + 20)
-#define PSOCK			(PRI_MIN_KERN + 24)
-#define PWAIT			(PRI_MIN_KERN + 28)
-#define PLOCK			(PRI_MIN_KERN + 32)
-#define PPAUSE			(PRI_MIN_KERN + 36)
+#define PVM			(PRI_MIN_KERN + 1)
+#define PINOD			(PRI_MIN_KERN + 2)
+#define PRIBIO			(PRI_MIN_KERN + 3)
+#define PVFS			(PRI_MIN_KERN + 4)
+#define PZERO			(PRI_MIN_KERN + 5)
+#define PSOCK			(PRI_MIN_KERN + 6)
+#define PWAIT			(PRI_MIN_KERN + 7)
+#define PLOCK			(PRI_MIN_KERN + 8)
+#define PPAUSE			(PRI_MIN_KERN + 9)
 
-#define PRI_MIN_TIMESHARE	(88)
+#define PRI_MIN_TIMESHARE	(56)
 #define PRI_MAX_TIMESHARE	(PRI_MIN_IDLE - 1)
 
 #define PUSER			(PRI_MIN_TIMESHARE)
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -53,7 +53,6 @@
 #include
 #include
 #include			/* XXX. */
-#include <sys/runq.h>
 #include
 #include
 #include
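The new comment pins down the contract: the realtime band must hold at least 32 levels because rtprio(2) levels 0 to 31 translate one-to-one into it, and likewise for the idle band. The sketch below illustrates that translation with the constants from this diff; the kernel's actual helper is rtp_to_pri(), and this standalone function is only an illustration consistent with the comment, not a copy of it:

    #include <assert.h>
    #include <stdio.h>

    #define PRI_MIN_REALTIME 8      /* per this diff */
    #define PRI_MIN_IDLE 224
    #define RTP_PRIO_MIN 0          /* rtprio(2) levels run 0..31 */
    #define RTP_PRIO_MAX 31

    enum rtp_type { RTP_REALTIME, RTP_IDLE };

    static int
    rtp_level_to_internal(enum rtp_type type, int level)
    {
            assert(level >= RTP_PRIO_MIN && level <= RTP_PRIO_MAX);
            return (type == RTP_REALTIME ?
                PRI_MIN_REALTIME + level : PRI_MIN_IDLE + level);
    }

    int
    main(void)
    {
            /* Realtime level 0 is internal 8; idle level 31 is internal 255. */
            printf("%d %d\n", rtp_level_to_internal(RTP_REALTIME, 0),
                rtp_level_to_internal(RTP_IDLE, 31));
            return (0);
    }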
diff --git a/sys/sys/runq.h b/sys/sys/runq.h
--- a/sys/sys/runq.h
+++ b/sys/sys/runq.h
@@ -29,7 +29,11 @@
 #ifndef _RUNQ_H_
 #define _RUNQ_H_
 
-#include <machine/runq.h>
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+#include <sys/types.h>		/* For bool. */
 
 struct thread;
 
@@ -37,20 +41,65 @@
  * Run queue parameters.
  */
-#define RQ_NQS	(64)	/* Number of run queues. */
-#define RQ_PPQ	(4)	/* Priorities per queue. */
+#define RQ_MAX_PRIO	(255)	/* Maximum priority (minimum is 0). */
+#define RQ_PPQ		(1)	/* Priorities per queue. */
 
 /*
- * Head of run queues.
+ * Convenience macros from <sys/param.h>.
  */
-TAILQ_HEAD(rqhead, thread);
+#ifndef NBBY
+#define NBBY	8
+#endif
+#ifndef howmany
+#define howmany(x, y)	(((x)+((y)-1))/(y))
+#endif
+
+/*
+ * Deduced from the above parameters and machine ones.
+ */
+#define RQ_NQS	(howmany(RQ_MAX_PRIO + 1, RQ_PPQ)) /* Number of run queues. */
+#define RQ_PRI_TO_QUEUE_IDX(pri) ((pri) / RQ_PPQ) /* Priority to queue index. */
+
+typedef unsigned long	rqsw_t;	/* runq's status words type. */
+#define RQSW_BPW	(sizeof(rqsw_t) * NBBY) /* Bits per runq word. */
+#if defined(_LP64)
+#define RQSW_L2BPW	(6)	/* Log2(sizeof(rqsw_t) * NBBY). */
+#elif defined(_ILP32)
+#define RQSW_L2BPW	(5)	/* Log2(sizeof(rqsw_t) * NBBY). */
+#else
+#error Not _LP64 nor _ILP32!
+#endif
+/*
+ * That RQSW_BPW and RQSW_L2BPW are consistent is checked by a static
+ * assertion.
+ */
+
+/* Number of status words to cover RQ_NQS queues. */
+#define RQSW_NB			(howmany(RQ_NQS, RQSW_BPW))
+#define RQSW_IDX(idx)		((idx) >> RQSW_L2BPW)
+#define RQSW_BIT_IDX(idx)	((idx) & (RQSW_BPW - 1))
+#define RQSW_BIT(idx)		(1ul << RQSW_BIT_IDX(idx))
+#define RQSW_BSF(word) ({						\
+	int _res = ffsl((long)(word)); /* Assumes two's complement. */	\
+	MPASS(_res > 0);						\
+	_res - 1;							\
+})
+#define RQSW_TO_QUEUE_IDX(word_idx, bit_idx)				\
+	(((word_idx) << RQSW_L2BPW) + (bit_idx))
+#define RQSW_FIRST_QUEUE_IDX(word_idx, word)				\
+	RQSW_TO_QUEUE_IDX(word_idx, RQSW_BSF(word))
+
+
+/*
+ * The queue for a given index as a list of threads.
+ */
+TAILQ_HEAD(rq_queue, thread);
 
 /*
  * Bit array which maintains the status of a run queue.  When a queue is
  * non-empty the bit corresponding to the queue number will be set.
  */
-struct rqbits {
-	rqb_word_t rqb_bits[RQB_LEN];
+struct rq_status {
+	rqsw_t rq_sw[RQSW_NB];
 };
 
 /*
@@ -58,18 +107,29 @@
  * are placed, and a structure to maintain the status of each queue.
  */
 struct runq {
-	struct rqbits rq_status;
-	struct rqhead rq_queues[RQ_NQS];
+	struct rq_status rq_status;
+	struct rq_queue rq_queues[RQ_NQS];
 };
 
-void	runq_add(struct runq *, struct thread *, int);
-void	runq_add_pri(struct runq *, struct thread *, u_char, int);
-int	runq_check(struct runq *);
-struct thread	*runq_choose(struct runq *);
-struct thread	*runq_choose_from(struct runq *, u_char);
-struct thread	*runq_choose_fuzz(struct runq *, int);
 void	runq_init(struct runq *);
-void	runq_remove(struct runq *, struct thread *);
-void	runq_remove_idx(struct runq *, struct thread *, u_char *);
+bool	runq_is_queue_empty(struct runq *, int _idx);
+void	runq_add(struct runq *, struct thread *, int _flags);
+void	runq_add_idx(struct runq *, struct thread *, int _idx, int _flags);
+bool	runq_remove(struct runq *, struct thread *);
+
+/*
+ * Implementation helpers for common and scheduler-specific runq_choose*()
+ * functions.
+ */
+typedef bool	runq_pred_t(int _idx, struct rq_queue *, void *_data);
+int	runq_findq(struct runq *const rq, const int lvl_min,
+	    const int lvl_max,
+	    runq_pred_t *const pred, void *const pred_data);
+struct thread	*runq_first_thread_range(struct runq *const rq,
+	    const int lvl_min, const int lvl_max);
+
+bool	runq_not_empty(struct runq *);
+struct thread	*runq_choose(struct runq *);
+struct thread	*runq_choose_fuzz(struct runq *, int _fuzz);
 
 #endif
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -63,6 +63,9 @@
 #define _SCHED_H_
 
 #ifdef _KERNEL
+
+#include <sys/types.h>		/* For bool. */
+
 /*
  * General scheduling info.
  *
@@ -74,7 +77,7 @@
  */
 int	sched_load(void);
 int	sched_rr_interval(void);
-int	sched_runnable(void);
+bool	sched_runnable(void);
 
 /*
  * Proc related scheduling hooks.
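With RQ_PPQ reduced to 1, a queue index is simply the priority, and the RQSW_* macros replace the per-architecture RQB_* set. The following runnable userland exercise of the added macros shows a priority being marked in the status words and then recovered by scanning for the first set bit; MPASS is replaced by assert() and RQSW_L2BPW is hard-coded on the assumption of an _LP64 target, so this builds outside the kernel:

    #include <assert.h>
    #include <stdio.h>
    #include <strings.h>    /* ffsl() */

    #define MPASS(ex) assert(ex)

    #define RQ_MAX_PRIO 255
    #define RQ_PPQ 1
    #define howmany(x, y) (((x) + ((y) - 1)) / (y))
    #define RQ_NQS (howmany(RQ_MAX_PRIO + 1, RQ_PPQ))

    typedef unsigned long rqsw_t;
    #define RQSW_BPW (sizeof(rqsw_t) * 8)
    #define RQSW_L2BPW 6    /* assumes _LP64 */
    #define RQSW_NB (howmany(RQ_NQS, RQSW_BPW))
    #define RQSW_IDX(idx) ((idx) >> RQSW_L2BPW)
    #define RQSW_BIT_IDX(idx) ((idx) & (RQSW_BPW - 1))
    #define RQSW_BIT(idx) (1ul << RQSW_BIT_IDX(idx))
    #define RQSW_BSF(word) __extension__ ({                 \
            int _res = ffsl((long)(word));                  \
            MPASS(_res > 0);                                \
            _res - 1;                                       \
    })
    #define RQSW_TO_QUEUE_IDX(w, b) (((w) << RQSW_L2BPW) + (b))
    #define RQSW_FIRST_QUEUE_IDX(w, word) RQSW_TO_QUEUE_IDX(w, RQSW_BSF(word))

    int
    main(void)
    {
            rqsw_t rq_sw[RQSW_NB] = { 0 };
            int pri = 137;  /* with RQ_PPQ == 1, queue index == priority */

            rq_sw[RQSW_IDX(pri)] |= RQSW_BIT(pri);
            for (int w = 0; w < (int)RQSW_NB; w++)
                    if (rq_sw[w] != 0) {
                            printf("first non-empty queue: %d\n",
                                RQSW_FIRST_QUEUE_IDX(w, rq_sw[w]));
                            break;
                    }
            return (0);
    }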
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -497,7 +497,7 @@
 	while (mp->mnt_secondary_writes != 0) {
 		BO_UNLOCK(bo);
 		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
-		    (PUSER - 1) | PDROP, "secwr", 0);
+		    PRI_MAX_KERN | PDROP, "secwr", 0);
 		BO_LOCK(bo);
 		MNT_ILOCK(mp);
 	}
@@ -14561,7 +14561,7 @@
 	while (mp->mnt_secondary_writes != 0) {
 		BO_UNLOCK(bo);
 		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
-		    (PUSER - 1) | PDROP, "secwr", 0);
+		    PRI_MAX_KERN | PDROP, "secwr", 0);
 		BO_LOCK(bo);
 		MNT_ILOCK(mp);
 	}
@@ -14601,7 +14601,7 @@
 			BO_UNLOCK(bo);
 			msleep(&mp->mnt_secondary_writes,
 			    MNT_MTX(mp),
-			    (PUSER - 1) | PDROP, "secwr", 0);
+			    PRI_MAX_KERN | PDROP, "secwr", 0);
 			BO_LOCK(bo);
 			continue;
 		}
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -1399,8 +1399,7 @@
 	VI_LOCK(vp);
 	while (ip->i_flag & IN_EA_LOCKED) {
 		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
-		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
-		    0);
+		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD, "ufs_ea", 0);
 	}
 	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
 	VI_UNLOCK(vp);
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
--- a/sys/ufs/ufs/ufs_quota.c
+++ b/sys/ufs/ufs/ufs_quota.c
@@ -179,7 +179,7 @@
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkdq1");
+		DQI_WAIT(dq, PINOD, "chkdq1");
 		ncurblocks = dq->dq_curblocks + change;
 		if (ncurblocks >= 0)
 			dq->dq_curblocks = ncurblocks;
@@ -201,7 +201,7 @@
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkdq2");
+		DQI_WAIT(dq, PINOD, "chkdq2");
 		if (do_check) {
 			error = chkdqchg(ip, change, cred, i, &warn);
 			if (error) {
@@ -215,7 +215,7 @@
 		if (dq == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkdq3");
+		DQI_WAIT(dq, PINOD, "chkdq3");
 		ncurblocks = dq->dq_curblocks - change;
 		if (ncurblocks >= 0)
 			dq->dq_curblocks = ncurblocks;
@@ -320,7 +320,7 @@
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkiq1");
+		DQI_WAIT(dq, PINOD, "chkiq1");
 		if (dq->dq_curinodes >= -change)
 			dq->dq_curinodes += change;
 		else
@@ -341,7 +341,7 @@
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkiq2");
+		DQI_WAIT(dq, PINOD, "chkiq2");
 		if (do_check) {
 			error = chkiqchg(ip, change, cred, i, &warn);
 			if (error) {
@@ -355,7 +355,7 @@
 		if (dq == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "chkiq3");
+		DQI_WAIT(dq, PINOD, "chkiq3");
 		if (dq->dq_curinodes >= change)
 			dq->dq_curinodes -= change;
 		else
@@ -855,7 +855,7 @@
 		return (error);
 	dq = ndq;
 	DQI_LOCK(dq);
-	DQI_WAIT(dq, PINOD+1, "setqta");
+	DQI_WAIT(dq, PINOD, "setqta");
 	/*
 	 * Copy all but the current values.
 	 * Reset time limit if previously had no soft limit or were
@@ -918,7 +918,7 @@
 		return (error);
 	dq = ndq;
 	DQI_LOCK(dq);
-	DQI_WAIT(dq, PINOD+1, "setuse");
+	DQI_WAIT(dq, PINOD, "setuse");
 	/*
 	 * Reset time limit if have a soft limit and were
 	 * previously under it, but are now over it.
@@ -1314,7 +1314,7 @@
 	if (dq != NULL) {
 		DQH_UNLOCK();
 hfound:		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "dqget");
+		DQI_WAIT(dq, PINOD, "dqget");
 		DQI_UNLOCK(dq);
 		if (dq->dq_ump == NULL) {
 			dqrele(vp, dq);
@@ -1588,7 +1588,7 @@
 	vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
 
 	DQI_LOCK(dq);
-	DQI_WAIT(dq, PINOD+2, "dqsync");
+	DQI_WAIT(dq, PINOD, "dqsync");
 	if ((dq->dq_flags & DQ_MOD) == 0)
 		goto out;
 	dq->dq_flags |= DQ_LOCK;
@@ -1742,7 +1742,7 @@
 		if ((dq = qrp[i]) == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
-		DQI_WAIT(dq, PINOD+1, "adjqta");
+		DQI_WAIT(dq, PINOD, "adjqta");
 		ncurblocks = dq->dq_curblocks + blkcount;
 		if (ncurblocks >= 0)
 			dq->dq_curblocks = ncurblocks;
diff --git a/tests/sys/kern/ptrace_test.c b/tests/sys/kern/ptrace_test.c
--- a/tests/sys/kern/ptrace_test.c
+++ b/tests/sys/kern/ptrace_test.c
@@ -34,7 +34,6 @@
 #include
 #include
 #include
-#include <sys/runq.h>
 #include
 #include
 #include
@@ -2027,7 +2026,7 @@
 	    sched_get_priority_min(SCHED_FIFO)) / 2;
 	CHILD_REQUIRE(pthread_setschedparam(pthread_self(), SCHED_FIFO,
 	    &sched_param) == 0);
-	sched_param.sched_priority -= RQ_PPQ;
+	sched_param.sched_priority -= 1;
 	CHILD_REQUIRE(pthread_setschedparam(t, SCHED_FIFO,
 	    &sched_param) == 0);
@@ -2130,7 +2129,7 @@
 	    sched_get_priority_min(SCHED_FIFO)) / 2;
 	CHILD_REQUIRE(pthread_setschedparam(pthread_self(), SCHED_FIFO,
 	    &sched_param) == 0);
-	sched_param.sched_priority -= RQ_PPQ;
+	sched_param.sched_priority -= 1;
 	CHILD_REQUIRE(pthread_setschedparam(t, SCHED_FIFO,
 	    &sched_param) == 0);
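The test previously spaced its two SCHED_FIFO threads RQ_PPQ apart because, with four priorities per queue, a smaller difference could land both threads in the same run queue; with one priority per queue a delta of 1 is sufficient, and RQ_PPQ no longer needs to be visible to userland at all. A runnable sketch of why (the sample priorities are arbitrary):

    #include <stdio.h>

    #define OLD_RQ_PPQ 4
    #define NEW_RQ_PPQ 1

    int
    main(void)
    {
            int hi = 26, lo = 25;   /* two SCHED_FIFO levels, delta 1 */

            /* Old mapping: 26/4 == 25/4 == 6, same queue. */
            printf("old queues: %d vs %d\n", hi / OLD_RQ_PPQ, lo / OLD_RQ_PPQ);
            /* New mapping: distinct queues, so strict priority holds. */
            printf("new queues: %d vs %d\n", hi / NEW_RQ_PPQ, lo / NEW_RQ_PPQ);
            return (0);
    }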