Index: head/gnu/usr.bin/gdb/kgdb/trgt_i386.c
===================================================================
--- head/gnu/usr.bin/gdb/kgdb/trgt_i386.c	(revision 332488)
+++ head/gnu/usr.bin/gdb/kgdb/trgt_i386.c	(revision 332489)
@@ -1,400 +1,418 @@
 /*
  * Copyright (c) 2004 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include
 __FBSDID("$FreeBSD$");

 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include

 #include "kgdb.h"

 static int ofs_fix;

 CORE_ADDR
 kgdb_trgt_core_pcb(u_int cpuid)
 {
 	return (kgdb_trgt_stop_pcb(cpuid, sizeof(struct pcb)));
 }

 void
 kgdb_trgt_fetch_registers(int regno __unused)
 {
 	struct kthr *kt;
 	struct pcb pcb;

 	kt = kgdb_thr_lookup_tid(ptid_get_pid(inferior_ptid));
 	if (kt == NULL)
 		return;
 	if (kvm_read(kvm, kt->pcb, &pcb, sizeof(pcb)) != sizeof(pcb)) {
 		warnx("kvm_read: %s", kvm_geterr(kvm));
 		memset(&pcb, 0, sizeof(pcb));
 	}
 	supply_register(I386_EBX_REGNUM, (char *)&pcb.pcb_ebx);
 	supply_register(I386_ESP_REGNUM, (char *)&pcb.pcb_esp);
 	supply_register(I386_EBP_REGNUM, (char *)&pcb.pcb_ebp);
 	supply_register(I386_ESI_REGNUM, (char *)&pcb.pcb_esi);
 	supply_register(I386_EDI_REGNUM, (char *)&pcb.pcb_edi);
 	supply_register(I386_EIP_REGNUM, (char *)&pcb.pcb_eip);
 }

 void
 kgdb_trgt_store_registers(int regno __unused)
 {
 	fprintf_unfiltered(gdb_stderr, "XXX: %s\n", __func__);
 }

 void
 kgdb_trgt_new_objfile(struct objfile *objfile)
 {
 	/*
 	 * In revision 1.117 of i386/i386/exception.S trap handlers
 	 * were changed to pass trapframes by reference rather than
 	 * by value.  Detect this by seeing if the first instruction
 	 * at the 'calltrap' label is a "push %esp" which has the
 	 * opcode 0x54.
 	 */
 	if (kgdb_parse("((char *)calltrap)[0]") == 0x54)
 		ofs_fix = 4;
 	else
 		ofs_fix = 0;
 }
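/*
 * The probe above reduces to reading one byte of kernel text and
 * comparing it with the "push %esp" opcode.  A minimal standalone
 * sketch of the same check using plain libkvm instead of kgdb_parse();
 * probe_calltrap() and its error conventions are illustrative
 * assumptions, not part of this change or of the kgdb API.
 */
#include <kvm.h>
#include <nlist.h>

static int
probe_calltrap(kvm_t *kd)
{
	struct nlist nl[2] = { { .n_name = "calltrap" }, { .n_name = NULL } };
	unsigned char insn;

	/* Resolve the symbol, then read the first instruction byte. */
	if (kvm_nlist(kd, nl) != 0 || nl[0].n_value == 0)
		return (-1);
	if (kvm_read(kd, nl[0].n_value, &insn, sizeof(insn)) != sizeof(insn))
		return (-1);
	return (insn == 0x54);		/* 0x54 is "push %esp" */
}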
 struct kgdb_tss_cache {
 	CORE_ADDR	pc;
 	CORE_ADDR	sp;
 	CORE_ADDR	tss;
 };

 static int kgdb_trgt_tss_offset[15] = {
 	offsetof(struct i386tss, tss_eax),
 	offsetof(struct i386tss, tss_ecx),
 	offsetof(struct i386tss, tss_edx),
 	offsetof(struct i386tss, tss_ebx),
 	offsetof(struct i386tss, tss_esp),
 	offsetof(struct i386tss, tss_ebp),
 	offsetof(struct i386tss, tss_esi),
 	offsetof(struct i386tss, tss_edi),
 	offsetof(struct i386tss, tss_eip),
 	offsetof(struct i386tss, tss_eflags),
 	offsetof(struct i386tss, tss_cs),
 	offsetof(struct i386tss, tss_ss),
 	offsetof(struct i386tss, tss_ds),
 	offsetof(struct i386tss, tss_es),
 	offsetof(struct i386tss, tss_fs)
 };

 /*
  * If the current thread is executing on a CPU, fetch the common_tss
  * for that CPU.
  *
  * This is painful because 'struct pcpu' is variant sized, so we can't
  * use it.  Instead, we look up the GDT selector for this CPU and
  * extract the base of the TSS from there.
  */
 static CORE_ADDR
 kgdb_trgt_fetch_tss(void)
 {
 	struct kthr *kt;
 	struct segment_descriptor sd;
 	uintptr_t addr, cpu0prvpage, tss;

 	kt = kgdb_thr_lookup_tid(ptid_get_pid(inferior_ptid));
 	if (kt == NULL || kt->cpu == NOCPU || kt->cpu < 0)
 		return (0);

 	addr = kgdb_lookup("gdt");
 	if (addr == 0)
 		return (0);
 	addr += (kt->cpu * NGDT + GPROC0_SEL) * sizeof(sd);
 	if (kvm_read(kvm, addr, &sd, sizeof(sd)) != sizeof(sd)) {
 		warnx("kvm_read: %s", kvm_geterr(kvm));
 		return (0);
 	}
 	if (sd.sd_type != SDT_SYS386BSY) {
 		warnx("descriptor is not a busy TSS");
 		return (0);
 	}
 	tss = sd.sd_hibase << 24 | sd.sd_lobase;

 	/*
 	 * In SMP kernels, the TSS is stored as part of the per-CPU
 	 * data.  On older kernels, CPU0's private page
 	 * is stored at an address that isn't mapped in minidumps.
 	 * However, the data is mapped at the alternate cpu0prvpage
 	 * address.  Thus, if the TSS is at the invalid address,
 	 * change it to be relative to cpu0prvpage instead.
 	 */
 	if (trunc_page(tss) == 0xffc00000) {
 		addr = kgdb_lookup("cpu0prvpage");
 		if (addr == 0)
 			return (0);
 		if (kvm_read(kvm, addr, &cpu0prvpage, sizeof(cpu0prvpage)) !=
 		    sizeof(cpu0prvpage)) {
 			warnx("kvm_read: %s", kvm_geterr(kvm));
 			return (0);
 		}
 		tss = cpu0prvpage + (tss & PAGE_MASK);
 	}
 	return ((CORE_ADDR)tss);
 }
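/*
 * A minimal sketch of the base reassembly used above: an i386 segment
 * descriptor stores its 32-bit base split into a 24-bit low field and
 * an 8-bit high field, so the TSS address is (hibase << 24) | lobase.
 * Plain C with stand-in fields; the kernel's real layout is struct
 * segment_descriptor in <machine/segments.h>.
 */
#include <stdint.h>

struct sd_base_fields {		/* stand-in, not the kernel struct */
	uint32_t lobase;	/* base bits 0..23 (sd_lobase) */
	uint32_t hibase;	/* base bits 24..31 (sd_hibase) */
};

static uint32_t
sd_base(const struct sd_base_fields *sd)
{
	return ((sd->hibase << 24) | (sd->lobase & 0x00ffffff));
}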
 static struct kgdb_tss_cache *
 kgdb_trgt_tss_cache(struct frame_info *next_frame, void **this_cache)
 {
 	char buf[MAX_REGISTER_SIZE];
 	struct kgdb_tss_cache *cache;

 	cache = *this_cache;
 	if (cache == NULL) {
 		cache = FRAME_OBSTACK_ZALLOC(struct kgdb_tss_cache);
 		*this_cache = cache;
 		cache->pc = frame_func_unwind(next_frame);
 		frame_unwind_register(next_frame, SP_REGNUM, buf);
 		cache->sp = extract_unsigned_integer(buf,
 		    register_size(current_gdbarch, SP_REGNUM));
 		cache->tss = kgdb_trgt_fetch_tss();
 	}
 	return (cache);
 }

 static void
 kgdb_trgt_dblfault_this_id(struct frame_info *next_frame, void **this_cache,
     struct frame_id *this_id)
 {
 	struct kgdb_tss_cache *cache;

 	cache = kgdb_trgt_tss_cache(next_frame, this_cache);
 	*this_id = frame_id_build(cache->sp, cache->pc);
 }

 static void
 kgdb_trgt_dblfault_prev_register(struct frame_info *next_frame,
     void **this_cache, int regnum, int *optimizedp, enum lval_type *lvalp,
     CORE_ADDR *addrp, int *realnump, void *valuep)
 {
 	char dummy_valuep[MAX_REGISTER_SIZE];
 	struct kgdb_tss_cache *cache;
 	int ofs, regsz;

 	regsz = register_size(current_gdbarch, regnum);

 	if (valuep == NULL)
 		valuep = dummy_valuep;
 	memset(valuep, 0, regsz);
 	*optimizedp = 0;
 	*addrp = 0;
 	*lvalp = not_lval;
 	*realnump = -1;

 	ofs = (regnum >= I386_EAX_REGNUM && regnum <= I386_FS_REGNUM) ?
 	    kgdb_trgt_tss_offset[regnum] : -1;
 	if (ofs == -1)
 		return;

 	cache = kgdb_trgt_tss_cache(next_frame, this_cache);
 	if (cache->tss == 0)
 		return;
 	*addrp = cache->tss + ofs;
 	*lvalp = lval_memory;
 	target_read_memory(*addrp, valuep, regsz);
 }

 static const struct frame_unwind kgdb_trgt_dblfault_unwind = {
 	UNKNOWN_FRAME,
 	&kgdb_trgt_dblfault_this_id,
 	&kgdb_trgt_dblfault_prev_register
 };

 struct kgdb_frame_cache {
 	int		frame_type;
 	CORE_ADDR	pc;
 	CORE_ADDR	sp;
 };
 #define	FT_NORMAL		1
 #define	FT_INTRFRAME		2
 #define	FT_INTRTRAPFRAME	3
 #define	FT_TIMERFRAME		4

 static int kgdb_trgt_frame_offset[15] = {
 	offsetof(struct trapframe, tf_eax),
 	offsetof(struct trapframe, tf_ecx),
 	offsetof(struct trapframe, tf_edx),
 	offsetof(struct trapframe, tf_ebx),
 	offsetof(struct trapframe, tf_esp),
 	offsetof(struct trapframe, tf_ebp),
 	offsetof(struct trapframe, tf_esi),
 	offsetof(struct trapframe, tf_edi),
 	offsetof(struct trapframe, tf_eip),
 	offsetof(struct trapframe, tf_eflags),
 	offsetof(struct trapframe, tf_cs),
 	offsetof(struct trapframe, tf_ss),
 	offsetof(struct trapframe, tf_ds),
 	offsetof(struct trapframe, tf_es),
 	offsetof(struct trapframe, tf_fs)
 };

 static struct kgdb_frame_cache *
 kgdb_trgt_frame_cache(struct frame_info *next_frame, void **this_cache)
 {
 	char buf[MAX_REGISTER_SIZE];
 	struct kgdb_frame_cache *cache;
 	char *pname;
+	CORE_ADDR pcx;
+	uintptr_t addr, setidt_disp;

 	cache = *this_cache;
 	if (cache == NULL) {
 		cache = FRAME_OBSTACK_ZALLOC(struct kgdb_frame_cache);
 		*this_cache = cache;
-		cache->pc = frame_func_unwind(next_frame);
+		pcx = frame_pc_unwind(next_frame);
+		if (pcx >= PMAP_TRM_MIN_ADDRESS) {
+			addr = kgdb_lookup("setidt_disp");
+			if (addr != 0) {
+				if (kvm_read(kvm, addr, &setidt_disp,
+				    sizeof(setidt_disp)) !=
+				    sizeof(setidt_disp))
+					warnx("kvm_read: %s", kvm_geterr(kvm));
+				else
+					pcx -= setidt_disp;
+			}
+		}
+		cache->pc = pcx;
 		find_pc_partial_function(cache->pc, &pname, NULL, NULL);
 		if (pname[0] != 'X')
 			cache->frame_type = FT_NORMAL;
 		else if (strcmp(pname, "Xtimerint") == 0)
 			cache->frame_type = FT_TIMERFRAME;
 		else if (strcmp(pname, "Xcpustop") == 0 ||
 		    strcmp(pname, "Xrendezvous") == 0 ||
 		    strcmp(pname, "Xipi_intr_bitmap_handler") == 0 ||
 		    strcmp(pname, "Xlazypmap") == 0)
 			cache->frame_type = FT_INTRTRAPFRAME;
 		else
 			cache->frame_type = FT_INTRFRAME;
 		frame_unwind_register(next_frame, SP_REGNUM, buf);
 		cache->sp = extract_unsigned_integer(buf,
 		    register_size(current_gdbarch, SP_REGNUM));
 	}
 	return (cache);
 }
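/*
 * A minimal sketch of the PC adjustment added above: with the i386
 * 4/4G split, low-level exception code runs from a trampoline placed
 * above PMAP_TRM_MIN_ADDRESS, displaced from its link-time address by
 * setidt_disp.  Subtracting the displacement maps a trampoline PC back
 * onto the addresses that symbol lookup understands.  The helper below
 * is illustrative only, not part of this commit.
 */
#include <stdint.h>

static uint32_t
detrampoline_pc(uint32_t pc, uint32_t trm_min_addr, uint32_t setidt_disp)
{
	/* Only PCs inside the trampoline region need rebasing. */
	return (pc >= trm_min_addr ? pc - setidt_disp : pc);
}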
 static void
 kgdb_trgt_trapframe_this_id(struct frame_info *next_frame, void **this_cache,
     struct frame_id *this_id)
 {
 	struct kgdb_frame_cache *cache;

 	cache = kgdb_trgt_frame_cache(next_frame, this_cache);
 	*this_id = frame_id_build(cache->sp, cache->pc);
 }

 static void
 kgdb_trgt_trapframe_prev_register(struct frame_info *next_frame,
     void **this_cache, int regnum, int *optimizedp, enum lval_type *lvalp,
     CORE_ADDR *addrp, int *realnump, void *valuep)
 {
 	char dummy_valuep[MAX_REGISTER_SIZE];
 	struct kgdb_frame_cache *cache;
 	int ofs, regsz;

 	regsz = register_size(current_gdbarch, regnum);

 	if (valuep == NULL)
 		valuep = dummy_valuep;
 	memset(valuep, 0, regsz);
 	*optimizedp = 0;
 	*addrp = 0;
 	*lvalp = not_lval;
 	*realnump = -1;

 	ofs = (regnum >= I386_EAX_REGNUM && regnum <= I386_FS_REGNUM) ?
 	    kgdb_trgt_frame_offset[regnum] + ofs_fix : -1;
 	if (ofs == -1)
 		return;

 	cache = kgdb_trgt_frame_cache(next_frame, this_cache);
 	switch (cache->frame_type) {
 	case FT_NORMAL:
 		break;
 	case FT_INTRFRAME:
 		ofs += 4;
 		break;
 	case FT_TIMERFRAME:
 		break;
 	case FT_INTRTRAPFRAME:
 		ofs -= ofs_fix;
 		break;
 	default:
 		fprintf_unfiltered(gdb_stderr, "Correct FT_XXX frame offsets "
 		    "for %d\n", cache->frame_type);
 		break;
 	}
 	*addrp = cache->sp + ofs;
 	*lvalp = lval_memory;
 	target_read_memory(*addrp, valuep, regsz);
 }

 static const struct frame_unwind kgdb_trgt_trapframe_unwind = {
 	UNKNOWN_FRAME,
 	&kgdb_trgt_trapframe_this_id,
 	&kgdb_trgt_trapframe_prev_register
 };

 const struct frame_unwind *
 kgdb_trgt_trapframe_sniffer(struct frame_info *next_frame)
 {
 	char *pname;
 	CORE_ADDR pc;

 	pc = frame_pc_unwind(next_frame);
+	if (pc >= PMAP_TRM_MIN_ADDRESS)
+		return (&kgdb_trgt_trapframe_unwind);
 	pname = NULL;
 	find_pc_partial_function(pc, &pname, NULL, NULL);
 	if (pname == NULL)
 		return (NULL);
 	if (strcmp(pname, "dblfault_handler") == 0)
 		return (&kgdb_trgt_dblfault_unwind);
 	if (strcmp(pname, "calltrap") == 0 ||
 	    (pname[0] == 'X' && pname[1] != '_'))
 		return (&kgdb_trgt_trapframe_unwind);
 	/* printf("%s: %llx =%s\n", __func__, pc, pname); */
 	return (NULL);
 }

 /*
  * This function ensures that the PC is inside the
  * function section which is understood by GDB.
  *
  * Return 0 when fixup is necessary, -1 otherwise.
  */
 int
 kgdb_trgt_pc_fixup(CORE_ADDR *pc __unused)
 {
 	return (-1);
 }
Index: head/sys/conf/files.i386
===================================================================
--- head/sys/conf/files.i386	(revision 332488)
+++ head/sys/conf/files.i386	(revision 332489)
@@ -1,638 +1,639 @@
 # This file tells config what files go into building a kernel,
 # files marked standard are always included.
 #
 # $FreeBSD$
 #
 # The long compile-with and dependency lines are required because of
 # limitations in config: backslash-newline doesn't work in strings, and
 # dependency lines other than the first are silently ignored.
 #
 cloudabi32_vdso.o	optional compat_cloudabi32 \
 	dependency	"$S/contrib/cloudabi/cloudabi_vdso_i686.S" \
 	compile-with	"${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_i686.S -o ${.TARGET}" \
 	no-obj no-implicit-rule \
 	clean		"cloudabi32_vdso.o"
 #
 cloudabi32_vdso_blob.o	optional compat_cloudabi32 \
 	dependency	"cloudabi32_vdso.o" \
 	compile-with	"${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 cloudabi32_vdso.o ${.TARGET}" \
 	no-implicit-rule \
 	clean		"cloudabi32_vdso_blob.o"
 #
 linux_genassym.o	optional compat_linux \
 	dependency	"$S/i386/linux/linux_genassym.c" \
 	compile-with	"${CC} ${CFLAGS:N-flto:N-fno-common} -c ${.IMPSRC}" \
 	no-obj no-implicit-rule \
 	clean		"linux_genassym.o"
 #
 linux_assym.h	optional compat_linux \
 	dependency	"$S/kern/genassym.sh linux_genassym.o" \
 	compile-with	"sh $S/kern/genassym.sh linux_genassym.o > ${.TARGET}" \
 	no-obj no-implicit-rule before-depend \
 	clean		"linux_assym.h"
 #
 linux_locore.o	optional compat_linux \
 	dependency	"linux_assym.h $S/i386/linux/linux_locore.s" \
 	compile-with	"${CC} -x assembler-with-cpp -DLOCORE -shared -s -pipe -I.
-I$S -Werror -Wall -fPIC -fno-common -nostdinc -nostdlib -Wl,-T$S/i386/linux/linux_vdso.lds.s -Wl,-soname=linux_vdso.so,--eh-frame-hdr,-warn-common ${.IMPSRC} -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "linux_locore.o" # linux_vdso.so optional compat_linux \ dependency "linux_locore.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 linux_locore.o ${.TARGET}" \ no-implicit-rule \ clean "linux_vdso.so" # font.h optional sc_dflt_font \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # hpt27xx_lib.o optional hpt27xx \ dependency "$S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ compile-with "uudecode < $S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ no-implicit-rule # hptmvraid.o optional hptmv \ dependency "$S/dev/hptmv/i386-elf.raid.o.uu" \ compile-with "uudecode < $S/dev/hptmv/i386-elf.raid.o.uu" \ no-implicit-rule # hptnr_lib.o optional hptnr \ dependency "$S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ no-implicit-rule # hptrr_lib.o optional hptrr \ dependency "$S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ no-implicit-rule # cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs | dtrace compile-with "${ZFS_S}" cddl/dev/dtrace/i386/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/i386/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/x86/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" cddl/dev/dtrace/x86/dis_tables.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" cddl/dev/dtrace/x86/instr_size.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs compat/linux/linux_event.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_errno.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux compat/linux/linux_futex.c optional compat_linux compat/linux/linux_getcwd.c optional compat_linux compat/linux/linux_ioctl.c optional compat_linux compat/linux/linux_ipc.c optional compat_linux 
compat/linux/linux_mib.c optional compat_linux compat/linux/linux_misc.c optional compat_linux compat/linux/linux_mmap.c optional compat_linux compat/linux/linux_signal.c optional compat_linux compat/linux/linux_socket.c optional compat_linux compat/linux/linux_stats.c optional compat_linux compat/linux/linux_sysctl.c optional compat_linux compat/linux/linux_time.c optional compat_linux compat/linux/linux_timer.c optional compat_linux compat/linux/linux_uid16.c optional compat_linux compat/linux/linux_util.c optional compat_linux compat/linux/linux_vdso.c optional compat_linux compat/linux/linux.c optional compat_linux compat/ndis/kern_ndis.c optional ndisapi pci compat/ndis/kern_windrv.c optional ndisapi pci compat/ndis/subr_hal.c optional ndisapi pci compat/ndis/subr_ndis.c optional ndisapi pci compat/ndis/subr_ntoskrnl.c optional ndisapi pci compat/ndis/subr_pe.c optional ndisapi pci compat/ndis/subr_usbd.c optional ndisapi pci compat/ndis/winx32_wrap.S optional ndisapi pci bf_enc.o optional crypto | ipsec | ipsec_support \ dependency "$S/crypto/blowfish/arch/i386/bf_enc.S $S/crypto/blowfish/arch/i386/bf_enc_586.S $S/crypto/blowfish/arch/i386/bf_enc_686.S" \ compile-with "${CC} -c -I$S/crypto/blowfish/arch/i386 ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}" \ no-implicit-rule crypto/aesni/aeskeys_i386.S optional aesni crypto/aesni/aesni.c optional aesni aesni_ghash.o optional aesni \ dependency "$S/crypto/aesni/aesni_ghash.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ghash.o" aesni_wrap.o optional aesni \ dependency "$S/crypto/aesni/aesni_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_wrap.o" crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb intel_sha1.o optional aesni \ dependency "$S/crypto/aesni/intel_sha1.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ no-implicit-rule \ clean "intel_sha1.o" intel_sha256.o optional aesni \ dependency "$S/crypto/aesni/intel_sha256.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ no-implicit-rule \ clean "intel_sha256.o" crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/acpica/acpi_pci.c optional acpi pci dev/acpica/acpi_pci_link.c optional acpi pci dev/acpica/acpi_pcib.c optional acpi pci dev/acpica/acpi_pcib_acpi.c optional acpi pci dev/acpica/acpi_pcib_pci.c optional acpi pci dev/advansys/adv_isa.c optional adv isa dev/agp/agp_ali.c optional agp dev/agp/agp_amd.c optional agp dev/agp/agp_amd64.c optional agp dev/agp/agp_ati.c optional agp dev/agp/agp_i810.c optional agp dev/agp/agp_intel.c optional agp dev/agp/agp_nvidia.c optional agp dev/agp/agp_sis.c optional agp dev/agp/agp_via.c optional agp dev/aic/aic_isa.c optional aic isa dev/amdsbwd/amdsbwd.c optional amdsbwd dev/amdsmn/amdsmn.c optional amdsmn | amdtemp dev/amdtemp/amdtemp.c optional amdtemp dev/arcmsr/arcmsr.c optional arcmsr pci dev/asmc/asmc.c optional asmc isa dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm 
atkbdc dev/bxe/bxe.c optional bxe pci dev/bxe/bxe_stats.c optional bxe pci dev/bxe/bxe_debug.c optional bxe pci dev/bxe/ecore_sp.c optional bxe pci dev/bxe/bxe_elink.c optional bxe pci dev/bxe/57710_init_values.c optional bxe pci dev/bxe/57711_init_values.c optional bxe pci dev/bxe/57712_init_values.c optional bxe pci dev/ce/ceddk.c optional ce dev/ce/if_ce.c optional ce dev/ce/tau32-ddk.c optional ce \ compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}" dev/cm/if_cm_isa.c optional cm isa dev/coretemp/coretemp.c optional coretemp dev/cp/cpddk.c optional cp dev/cp/if_cp.c optional cp dev/cpuctl/cpuctl.c optional cpuctl dev/ctau/ctau.c optional ctau dev/ctau/ctddk.c optional ctau dev/ctau/if_ct.c optional ctau dev/cx/csigma.c optional cx dev/cx/cxddk.c optional cx dev/cx/if_cx.c optional cx dev/dpms/dpms.c optional dpms dev/ed/if_ed_3c503.c optional ed isa ed_3c503 dev/ed/if_ed_isa.c optional ed isa dev/ed/if_ed_wd80x3.c optional ed isa dev/ed/if_ed_hpp.c optional ed isa ed_hpp dev/ed/if_ed_sic.c optional ed isa ed_sic dev/ep/elink.c optional ep dev/fb/fb.c optional fb | vga dev/fb/s3_pci.c optional s3pci dev/fb/vesa.c optional vga vesa dev/fb/vga.c optional vga dev/fdc/fdc.c optional fdc dev/fdc/fdc_acpi.c optional fdc dev/fdc/fdc_isa.c optional fdc isa dev/fdc/fdc_pccard.c optional fdc pccard dev/fe/if_fe_isa.c optional fe isa dev/glxiic/glxiic.c optional glxiic dev/glxsb/glxsb.c optional glxsb dev/glxsb/glxsb_hash.c optional glxsb dev/gpio/bytgpio.c optional bytgpio dev/gpio/chvgpio.c optional chvgpio dev/hpt27xx/hpt27xx_os_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_osm_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_config.c optional hpt27xx dev/hptmv/entry.c optional hptmv dev/hptmv/mv.c optional hptmv dev/hptmv/gui_lib.c optional hptmv dev/hptmv/hptproc.c optional hptmv dev/hptmv/ioctl.c optional hptmv dev/hptnr/hptnr_os_bsd.c optional hptnr dev/hptnr/hptnr_osm_bsd.c optional hptnr dev/hptnr/hptnr_config.c optional hptnr dev/hptrr/hptrr_os_bsd.c optional hptrr dev/hptrr/hptrr_osm_bsd.c optional hptrr dev/hptrr/hptrr_config.c optional hptrr dev/hwpmc/hwpmc_amd.c optional hwpmc dev/hwpmc/hwpmc_intel.c optional hwpmc dev/hwpmc/hwpmc_core.c optional hwpmc dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_pentium.c optional hwpmc dev/hwpmc/hwpmc_piv.c optional hwpmc dev/hwpmc/hwpmc_ppro.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc dev/hyperv/pcib/vmbus_pcib.c optional hyperv pci dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hn_rndis.c optional hyperv dev/hyperv/netvsc/if_hn.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv dev/hyperv/utilities/hv_snapshot.c optional hyperv dev/hyperv/utilities/vmbus_heartbeat.c optional hyperv dev/hyperv/utilities/vmbus_ic.c optional hyperv dev/hyperv/utilities/vmbus_shutdown.c optional hyperv dev/hyperv/utilities/vmbus_timesync.c optional hyperv dev/hyperv/vmbus/hyperv.c optional hyperv dev/hyperv/vmbus/hyperv_busdma.c optional hyperv dev/hyperv/vmbus/vmbus.c optional hyperv pci dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv dev/hyperv/vmbus/vmbus_res.c optional hyperv dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/i386/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/i386/vmbus_vector.S optional hyperv dev/ichwd/ichwd.c optional ichwd dev/if_ndis/if_ndis.c optional ndis 
dev/if_ndis/if_ndis_pccard.c optional ndis pccard dev/if_ndis/if_ndis_pci.c optional ndis cardbus | ndis pci dev/if_ndis/if_ndis_usb.c optional ndis usb dev/imcsmb/imcsmb.c optional imcsmb dev/imcsmb/imcsmb_pci.c optional imcsmb pci dev/intel/spi.c optional intelspi dev/io/iodev.c optional io dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_acpi.c optional ipmi acpi dev/ipmi/ipmi_isa.c optional ipmi isa dev/ipmi/ipmi_kcs.c optional ipmi dev/ipmi/ipmi_smic.c optional ipmi dev/ipmi/ipmi_smbus.c optional ipmi smbus dev/ipmi/ipmi_smbios.c optional ipmi dev/ipmi/ipmi_ssif.c optional ipmi smbus dev/ipmi/ipmi_pci.c optional ipmi pci dev/ipmi/ipmi_linux.c optional ipmi compat_linux dev/le/if_le_isa.c optional le isa dev/mse/mse.c optional mse dev/mse/mse_isa.c optional mse isa dev/nctgpio/nctgpio.c optional nctgpio dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb dev/ntb/ntb_transport.c optional ntb_transport | if_ntb dev/ntb/ntb.c optional ntb | ntb_transport | if_ntb | ntb_hw_intel | ntb_hw_plx | ntb_hw dev/ntb/ntb_if.m optional ntb | ntb_transport | if_ntb | ntb_hw_intel | ntb_hw_plx | ntb_hw dev/ntb/ntb_hw/ntb_hw_intel.c optional ntb_hw_intel | ntb_hw dev/ntb/ntb_hw/ntb_hw_plx.c optional ntb_hw_plx | ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nvme/nvme.c optional nvme dev/nvme/nvme_ctrlr.c optional nvme dev/nvme/nvme_ctrlr_cmd.c optional nvme dev/nvme/nvme_ns.c optional nvme dev/nvme/nvme_ns_cmd.c optional nvme dev/nvme/nvme_qpair.c optional nvme dev/nvme/nvme_sysctl.c optional nvme dev/nvme/nvme_test.c optional nvme dev/nvme/nvme_util.c optional nvme dev/nvram/nvram.c optional nvram isa dev/ofw/ofwpci.c optional fdt pci dev/pcf/pcf_isa.c optional pcf dev/random/ivy.c optional rdrand_rng dev/random/nehemiah.c optional padlock_rng dev/sbni/if_sbni.c optional sbni dev/sbni/if_sbni_isa.c optional sbni isa dev/sbni/if_sbni_pci.c optional sbni pci dev/sio/sio.c optional sio dev/sio/sio_isa.c optional sio isa dev/sio/sio_pccard.c optional sio pccard dev/sio/sio_pci.c optional sio pci dev/sio/sio_puc.c optional sio puc dev/speaker/spkr.c optional speaker dev/syscons/apm/apm_saver.c optional apm_saver apm dev/syscons/scterm-teken.c optional sc dev/syscons/scvesactl.c optional sc vga vesa dev/syscons/scvgarndr.c optional sc vga dev/syscons/scvtb.c optional sc dev/tpm/tpm.c optional tpm dev/tpm/tpm_acpi.c optional tpm acpi dev/tpm/tpm_isa.c optional tpm isa dev/uart/uart_cpu_x86.c optional uart dev/viawd/viawd.c optional viawd dev/vmware/vmxnet3/if_vmx.c optional vmx dev/vmware/vmci/vmci.c optional vmci dev/vmware/vmci/vmci_datagram.c optional vmci dev/vmware/vmci/vmci_doorbell.c optional vmci dev/vmware/vmci/vmci_driver.c optional vmci dev/vmware/vmci/vmci_event.c optional vmci dev/vmware/vmci/vmci_hashtable.c optional vmci dev/vmware/vmci/vmci_kernel_if.c optional vmci dev/vmware/vmci/vmci_qpair.c optional vmci dev/vmware/vmci/vmci_queue_pair.c optional vmci dev/vmware/vmci/vmci_resource.c optional vmci dev/acpica/acpi_if.m standard dev/acpica/acpi_hpet.c optional acpi dev/acpica/acpi_timer.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/wbwd/wbwd.c optional wbwd dev/isci/isci.c optional isci dev/isci/isci_controller.c optional isci dev/isci/isci_domain.c optional isci dev/isci/isci_interrupt.c optional isci dev/isci/isci_io_request.c optional isci dev/isci/isci_logger.c optional isci dev/isci/isci_oem_parameters.c optional isci dev/isci/isci_remote_device.c optional isci dev/isci/isci_sysctl.c optional isci dev/isci/isci_task_request.c optional isci 
dev/isci/isci_timer.c optional isci dev/isci/scil/sati.c optional isci dev/isci/scil/sati_abort_task_set.c optional isci dev/isci/scil/sati_atapi.c optional isci dev/isci/scil/sati_device.c optional isci dev/isci/scil/sati_inquiry.c optional isci dev/isci/scil/sati_log_sense.c optional isci dev/isci/scil/sati_lun_reset.c optional isci dev/isci/scil/sati_mode_pages.c optional isci dev/isci/scil/sati_mode_select.c optional isci dev/isci/scil/sati_mode_sense.c optional isci dev/isci/scil/sati_mode_sense_10.c optional isci dev/isci/scil/sati_mode_sense_6.c optional isci dev/isci/scil/sati_move.c optional isci dev/isci/scil/sati_passthrough.c optional isci dev/isci/scil/sati_read.c optional isci dev/isci/scil/sati_read_buffer.c optional isci dev/isci/scil/sati_read_capacity.c optional isci dev/isci/scil/sati_reassign_blocks.c optional isci dev/isci/scil/sati_report_luns.c optional isci dev/isci/scil/sati_request_sense.c optional isci dev/isci/scil/sati_start_stop_unit.c optional isci dev/isci/scil/sati_synchronize_cache.c optional isci dev/isci/scil/sati_test_unit_ready.c optional isci dev/isci/scil/sati_unmap.c optional isci dev/isci/scil/sati_util.c optional isci dev/isci/scil/sati_verify.c optional isci dev/isci/scil/sati_write.c optional isci dev/isci/scil/sati_write_and_verify.c optional isci dev/isci/scil/sati_write_buffer.c optional isci dev/isci/scil/sati_write_long.c optional isci dev/isci/scil/sci_abstract_list.c optional isci dev/isci/scil/sci_base_controller.c optional isci dev/isci/scil/sci_base_domain.c optional isci dev/isci/scil/sci_base_iterator.c optional isci dev/isci/scil/sci_base_library.c optional isci dev/isci/scil/sci_base_logger.c optional isci dev/isci/scil/sci_base_memory_descriptor_list.c optional isci dev/isci/scil/sci_base_memory_descriptor_list_decorator.c optional isci dev/isci/scil/sci_base_object.c optional isci dev/isci/scil/sci_base_observer.c optional isci dev/isci/scil/sci_base_phy.c optional isci dev/isci/scil/sci_base_port.c optional isci dev/isci/scil/sci_base_remote_device.c optional isci dev/isci/scil/sci_base_request.c optional isci dev/isci/scil/sci_base_state_machine.c optional isci dev/isci/scil/sci_base_state_machine_logger.c optional isci dev/isci/scil/sci_base_state_machine_observer.c optional isci dev/isci/scil/sci_base_subject.c optional isci dev/isci/scil/sci_util.c optional isci dev/isci/scil/scic_sds_controller.c optional isci dev/isci/scil/scic_sds_library.c optional isci dev/isci/scil/scic_sds_pci.c optional isci dev/isci/scil/scic_sds_phy.c optional isci dev/isci/scil/scic_sds_port.c optional isci dev/isci/scil/scic_sds_port_configuration_agent.c optional isci dev/isci/scil/scic_sds_remote_device.c optional isci dev/isci/scil/scic_sds_remote_node_context.c optional isci dev/isci/scil/scic_sds_remote_node_table.c optional isci dev/isci/scil/scic_sds_request.c optional isci dev/isci/scil/scic_sds_sgpio.c optional isci dev/isci/scil/scic_sds_smp_remote_device.c optional isci dev/isci/scil/scic_sds_smp_request.c optional isci dev/isci/scil/scic_sds_ssp_request.c optional isci dev/isci/scil/scic_sds_stp_packet_request.c optional isci dev/isci/scil/scic_sds_stp_remote_device.c optional isci dev/isci/scil/scic_sds_stp_request.c optional isci dev/isci/scil/scic_sds_unsolicited_frame_control.c optional isci dev/isci/scil/scif_sas_controller.c optional isci dev/isci/scil/scif_sas_controller_state_handlers.c optional isci dev/isci/scil/scif_sas_controller_states.c optional isci dev/isci/scil/scif_sas_domain.c optional isci 
 dev/isci/scil/scif_sas_domain_state_handlers.c	optional isci
 dev/isci/scil/scif_sas_domain_states.c		optional isci
 dev/isci/scil/scif_sas_high_priority_request_queue.c	optional isci
 dev/isci/scil/scif_sas_internal_io_request.c	optional isci
 dev/isci/scil/scif_sas_io_request.c		optional isci
 dev/isci/scil/scif_sas_io_request_state_handlers.c	optional isci
 dev/isci/scil/scif_sas_io_request_states.c	optional isci
 dev/isci/scil/scif_sas_library.c		optional isci
 dev/isci/scil/scif_sas_remote_device.c		optional isci
 dev/isci/scil/scif_sas_remote_device_ready_substate_handlers.c	optional isci
 dev/isci/scil/scif_sas_remote_device_ready_substates.c	optional isci
 dev/isci/scil/scif_sas_remote_device_starting_substate_handlers.c	optional isci
 dev/isci/scil/scif_sas_remote_device_starting_substates.c	optional isci
 dev/isci/scil/scif_sas_remote_device_state_handlers.c	optional isci
 dev/isci/scil/scif_sas_remote_device_states.c	optional isci
 dev/isci/scil/scif_sas_request.c		optional isci
 dev/isci/scil/scif_sas_smp_activity_clear_affiliation.c	optional isci
 dev/isci/scil/scif_sas_smp_io_request.c	optional isci
 dev/isci/scil/scif_sas_smp_phy.c		optional isci
 dev/isci/scil/scif_sas_smp_remote_device.c	optional isci
 dev/isci/scil/scif_sas_stp_io_request.c	optional isci
 dev/isci/scil/scif_sas_stp_remote_device.c	optional isci
 dev/isci/scil/scif_sas_stp_task_request.c	optional isci
 dev/isci/scil/scif_sas_task_request.c		optional isci
 dev/isci/scil/scif_sas_task_request_state_handlers.c	optional isci
 dev/isci/scil/scif_sas_task_request_states.c	optional isci
 dev/isci/scil/scif_sas_timer.c			optional isci
 i386/acpica/acpi_machdep.c	optional acpi
 acpi_wakecode.o		optional acpi \
 	dependency	"$S/i386/acpica/acpi_wakecode.S assym.inc" \
 	compile-with	"${NORMAL_S}" \
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakecode.o"
 acpi_wakecode.bin	optional acpi \
 	dependency	"acpi_wakecode.o" \
 	compile-with	"${OBJCOPY} -S -O binary acpi_wakecode.o ${.TARGET}" \
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakecode.bin"
 acpi_wakecode.h		optional acpi \
 	dependency	"acpi_wakecode.bin" \
 	compile-with	"file2c -sx 'static char wakecode[] = {' '};' < acpi_wakecode.bin > ${.TARGET}" \
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakecode.h"
 acpi_wakedata.h		optional acpi \
 	dependency	"acpi_wakecode.o" \
 	compile-with	'${NM} -n --defined-only acpi_wakecode.o | while read offset dummy what; do echo "#define $${what} 0x$${offset}"; done > ${.TARGET}' \
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakedata.h"
 #
 i386/bios/apm.c		optional apm
 i386/bios/smapi.c	optional smapi
 i386/bios/smapi_bios.S	optional smapi
 i386/cloudabi32/cloudabi32_sysvec.c	optional compat_cloudabi32
 #i386/i386/apic_vector.s	optional apic
 i386/i386/atomic.c	standard \
 	compile-with	"${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}"
 i386/i386/bios.c	standard
 i386/i386/bioscall.s	standard
 i386/i386/bpf_jit_machdep.c	optional bpf_jitter
+i386/i386/copyout.c	standard
 i386/i386/db_disasm.c	optional ddb
 i386/i386/db_interface.c	optional ddb
 i386/i386/db_trace.c	optional ddb
 i386/i386/elan-mmcr.c	optional cpu_elan | cpu_soekris
 i386/i386/elf_machdep.c	standard
 i386/i386/exception.s	standard
 i386/i386/gdb_machdep.c	optional gdb
 i386/i386/geode.c	optional cpu_geode
 i386/i386/in_cksum.c	optional inet | inet6
 i386/i386/initcpu.c	standard
 i386/i386/io.c		optional io
 i386/i386/k6_mem.c	optional mem
 i386/i386/locore.s	standard no-obj
 i386/i386/longrun.c	optional cpu_enable_longrun
 i386/i386/machdep.c	standard
 i386/i386/mem.c		optional mem
i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp i386/i386/mp_machdep.c optional smp i386/i386/mpboot.s optional smp i386/i386/npx.c standard i386/i386/perfmon.c optional perfmon i386/i386/pmap.c standard i386/i386/prof_machdep.c optional profiling-routine i386/i386/ptrace_machdep.c standard i386/i386/sigtramp.s standard i386/i386/support.s standard i386/i386/swtch.s standard i386/i386/sys_machdep.c standard i386/i386/trap.c standard i386/i386/uio_machdep.c standard i386/i386/vm86.c standard i386/i386/vm_machdep.c standard i386/ibcs2/ibcs2_errno.c optional ibcs2 i386/ibcs2/ibcs2_fcntl.c optional ibcs2 i386/ibcs2/ibcs2_ioctl.c optional ibcs2 i386/ibcs2/ibcs2_ipc.c optional ibcs2 i386/ibcs2/ibcs2_isc.c optional ibcs2 i386/ibcs2/ibcs2_isc_sysent.c optional ibcs2 i386/ibcs2/ibcs2_misc.c optional ibcs2 i386/ibcs2/ibcs2_msg.c optional ibcs2 i386/ibcs2/ibcs2_other.c optional ibcs2 i386/ibcs2/ibcs2_signal.c optional ibcs2 i386/ibcs2/ibcs2_socksys.c optional ibcs2 i386/ibcs2/ibcs2_stat.c optional ibcs2 i386/ibcs2/ibcs2_sysent.c optional ibcs2 i386/ibcs2/ibcs2_sysi86.c optional ibcs2 i386/ibcs2/ibcs2_sysvec.c optional ibcs2 i386/ibcs2/ibcs2_util.c optional ibcs2 i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 i386/linux/imgact_linux.c optional compat_linux i386/linux/linux_dummy.c optional compat_linux i386/linux/linux_machdep.c optional compat_linux i386/linux/linux_ptrace.c optional compat_linux i386/linux/linux_support.s optional compat_linux \ dependency "linux_assym.h" i386/linux/linux_sysent.c optional compat_linux i386/linux/linux_sysvec.c optional compat_linux i386/pci/pci_cfgreg.c optional pci i386/pci/pci_pir.c optional pci isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/kern_clocksource.c standard kern/imgact_aout.c optional compat_aout kern/imgact_gzip.c optional gzip kern/subr_sfbuf.c standard libkern/divdi3.c standard libkern/ffsll.c standard libkern/flsll.c standard libkern/memmove.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard libkern/x86/crc32_sse42.c standard # # x86 real mode BIOS support, required by dpms/pci/vesa # compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa # # bvm console # dev/bvm/bvm_console.c optional bvmconsole dev/bvm/bvm_dbg.c optional bvmdebug # # x86 shared code between IA32 and AMD64 architectures # x86/acpica/OsdEnvironment.c optional acpi x86/acpica/acpi_apm.c optional acpi x86/acpica/acpi_wakeup.c optional acpi x86/acpica/madt.c optional acpi apic x86/acpica/srat.c optional acpi x86/bios/smbios.c optional smbios x86/bios/vpd.c optional vpd x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/smist.c optional cpufreq x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci x86/iommu/intel_ctx.c optional acpi acpi_dmar pci x86/iommu/intel_drv.c optional acpi acpi_dmar pci x86/iommu/intel_fault.c optional acpi acpi_dmar pci x86/iommu/intel_gas.c optional acpi acpi_dmar pci x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci x86/iommu/intel_intrmap.c optional acpi acpi_dmar pci x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic x86/isa/atrtc.c standard x86/isa/clock.c standard 
 x86/isa/elcr.c		optional atpic | apic
 x86/isa/isa.c		optional isa
 x86/isa/isa_dma.c	optional isa
 x86/isa/nmi.c		standard
 x86/isa/orm.c		optional isa
 x86/pci/pci_bus.c	optional pci
 x86/pci/qpi.c		optional pci
 x86/x86/autoconf.c	standard
 x86/x86/bus_machdep.c	standard
 x86/x86/busdma_bounce.c	standard
 x86/x86/busdma_machdep.c	standard
 x86/x86/cpu_machdep.c	standard
 x86/x86/dump_machdep.c	standard
 x86/x86/fdt_machdep.c	optional fdt
 x86/x86/identcpu.c	standard
 x86/x86/intr_machdep.c	standard
 x86/x86/io_apic.c	optional apic
 x86/x86/legacy.c	standard
 x86/x86/local_apic.c	optional apic
 x86/x86/mca.c		standard
 x86/x86/x86_mem.c	optional mem
 x86/x86/mptable.c	optional apic
 x86/x86/mptable_pci.c	optional apic pci
 x86/x86/mp_x86.c	optional smp
 x86/x86/mp_watchdog.c	optional mp_watchdog smp
 x86/x86/msi.c		optional apic pci
 x86/x86/nexus.c		standard
 x86/x86/stack_machdep.c	optional ddb | stack
 x86/x86/tsc.c		standard
 x86/x86/pvclock.c	standard
 x86/x86/delay.c		standard
 x86/xen/hvm.c		optional xenhvm
 x86/xen/xen_intr.c	optional xenhvm
 x86/xen/xen_apic.c	optional xenhvm
 x86/xen/xenpv.c		optional xenhvm
 x86/xen/xen_nexus.c	optional xenhvm
 x86/xen/xen_msi.c	optional xenhvm
Index: head/sys/conf/ldscript.i386
===================================================================
--- head/sys/conf/ldscript.i386	(revision 332488)
+++ head/sys/conf/ldscript.i386	(revision 332489)
@@ -1,198 +1,198 @@
 /* $FreeBSD$ */
 OUTPUT_FORMAT("elf32-i386-freebsd", "elf32-i386-freebsd", "elf32-i386-freebsd")
 OUTPUT_ARCH(i386)
 ENTRY(btext)
 SEARCH_DIR(/usr/lib);
 SECTIONS
 {
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernload + SIZEOF_HEADERS;
+  . = kernbase + SIZEOF_HEADERS;
   .interp         : { *(.interp) }
   .hash           : { *(.hash) }
   .gnu.hash       : { *(.gnu.hash) }
   .dynsym         : { *(.dynsym) }
   .dynstr         : { *(.dynstr) }
   .gnu.version    : { *(.gnu.version) }
   .gnu.version_d  : { *(.gnu.version_d) }
   .gnu.version_r  : { *(.gnu.version_r) }
   .rel.init       : { *(.rel.init) }
   .rela.init      : { *(.rela.init) }
   .rel.text       : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) }
   .rela.text      : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
   .rel.fini       : { *(.rel.fini) }
   .rela.fini      : { *(.rela.fini) }
   .rel.rodata     : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) }
   .rela.rodata    : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
   .rel.data.rel.ro : { *(.rel.data.rel.ro* .rel.gnu.linkonce.d.rel.ro.*) }
   .rela.data.rel.ro : { *(.rela.data.rel.ro* .rela.gnu.linkonce.d.rel.ro.*) }
   .rel.data       : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) }
   .rela.data      : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
   .rel.tdata      : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) }
   .rela.tdata     : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
   .rel.tbss       : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) }
   .rela.tbss      : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
   .rel.ctors      : { *(.rel.ctors) }
   .rela.ctors     : { *(.rela.ctors) }
   .rel.dtors      : { *(.rel.dtors) }
   .rela.dtors     : { *(.rela.dtors) }
   .rel.got        : { *(.rel.got) }
   .rela.got       : { *(.rela.got) }
   .rel.bss        : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) }
   .rela.bss       : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
   .rel.plt        : { *(.rel.plt) }
   .rela.plt       : { *(.rela.plt) }
   .init           : { KEEP (*(.init)) } =0xCCCCCCCC
   .plt            : { *(.plt) }
   .text           :
   {
     *(.text .stub .text.* .gnu.linkonce.t.*)
     KEEP (*(.text.*personality*))
     /* .gnu.warning sections are handled specially by elf32.em.
*/ *(.gnu.warning) } =0xCCCCCCCC .fini : { KEEP (*(.fini)) } =0xCCCCCCCC PROVIDE (__etext = .); PROVIDE (_etext = .); PROVIDE (etext = .); .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } .eh_frame_hdr : { *(.eh_frame_hdr) } .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } /* Adjust the address for the data segment. We want to adjust up to the same address within the page on the next page up. */ . = ALIGN (CONSTANT (MAXPAGESIZE)) - ((CONSTANT (MAXPAGESIZE) - .) & (CONSTANT (MAXPAGESIZE) - 1)); . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE)); /* Exception handling */ .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) } .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) } /* Thread Local Storage sections */ .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } .preinit_array : { PROVIDE_HIDDEN (__preinit_array_start = .); KEEP (*(.preinit_array)) PROVIDE_HIDDEN (__preinit_array_end = .); } .init_array : { PROVIDE_HIDDEN (__init_array_start = .); KEEP (*(SORT(.init_array.*))) KEEP (*(.init_array)) PROVIDE_HIDDEN (__init_array_end = .); } .fini_array : { PROVIDE_HIDDEN (__fini_array_start = .); KEEP (*(.fini_array)) KEEP (*(SORT(.fini_array.*))) PROVIDE_HIDDEN (__fini_array_end = .); } _start_ctors = .; PROVIDE (start_ctors = .); .ctors : { /* gcc uses crtbegin.o to find the start of the constructors, so we make sure it is first. Because this is a wildcard, it doesn't matter if the user does not actually link against crtbegin.o; the linker won't look for a file to match a wildcard. The wildcard also means that it doesn't matter which directory crtbegin.o is in. */ KEEP (*crtbegin.o(.ctors)) KEEP (*crtbegin?.o(.ctors)) /* We don't want to include the .ctor section from the crtend.o file until after the sorted ctors. The .ctor section from the crtend file contains the end of ctors marker and it must be last */ KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) KEEP (*(SORT(.ctors.*))) KEEP (*(.ctors)) } _stop_ctors = .; PROVIDE (stop_ctors = .); .dtors : { KEEP (*crtbegin.o(.dtors)) KEEP (*crtbegin?.o(.dtors)) KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) KEEP (*(SORT(.dtors.*))) KEEP (*(.dtors)) } .jcr : { KEEP (*(.jcr)) } .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro* .gnu.linkonce.d.rel.ro.*) } .dynamic : { *(.dynamic) } .got : { *(.got) } . = DATA_SEGMENT_RELRO_END (12, .); .got.plt : { *(.got.plt) } .data : { *(.data .data.* .gnu.linkonce.d.*) KEEP (*(.gnu.linkonce.d.*personality*)) } .data1 : { *(.data1) } _edata = .; PROVIDE (edata = .); __bss_start = .; .bss : { *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) *(COMMON) /* Align here to ensure that the .bss section occupies space up to _end. Align after .bss to ensure correct alignment even if the .bss section disappears because there are no input sections. FIXME: Why do we need it? When there is no .bss section, we don't pad the .data section. */ . = ALIGN(. != 0 ? 32 / 8 : 1); } . = ALIGN(32 / 8); . = ALIGN(32 / 8); _end = .; PROVIDE (end = .); . = DATA_SEGMENT_END (.); /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } /* DWARF debug sections. 
Symbols in the DWARF debugging sections are relative to the beginning of the section so we begin them at 0. */ /* DWARF 1 */ .debug 0 : { *(.debug) } .line 0 : { *(.line) } /* GNU DWARF 1 extensions */ .debug_srcinfo 0 : { *(.debug_srcinfo) } .debug_sfnames 0 : { *(.debug_sfnames) } /* DWARF 1.1 and DWARF 2 */ .debug_aranges 0 : { *(.debug_aranges) } .debug_pubnames 0 : { *(.debug_pubnames) } /* DWARF 2 */ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } .debug_abbrev 0 : { *(.debug_abbrev) } .debug_line 0 : { *(.debug_line) } .debug_frame 0 : { *(.debug_frame) } .debug_str 0 : { *(.debug_str) } .debug_loc 0 : { *(.debug_loc) } .debug_macinfo 0 : { *(.debug_macinfo) } /* SGI/MIPS DWARF 2 extensions */ .debug_weaknames 0 : { *(.debug_weaknames) } .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } /* DWARF 3 */ .debug_pubtypes 0 : { *(.debug_pubtypes) } .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { *(.note.GNU-stack) } } Index: head/sys/dev/dcons/dcons_crom.c =================================================================== --- head/sys/dev/dcons/dcons_crom.c (revision 332488) +++ head/sys/dev/dcons/dcons_crom.c (revision 332489) @@ -1,268 +1,272 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 2003 * Hidetoshi Shimokawa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * * This product includes software developed by Hidetoshi Shimokawa. * * 4. Neither the name of the author nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
  *
  * $Id: dcons_crom.c,v 1.8 2003/10/23 15:47:21 simokawa Exp $
  * $FreeBSD$
  */

 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include

 #if (defined(__i386__) || defined(__amd64__))
 #include
 #include
 #include
 #include		/* for idt */
 #endif

 static bus_addr_t dcons_paddr;

 static int force_console = 0;
 TUNABLE_INT("hw.firewire.dcons_crom.force_console", &force_console);

 #define ADDR_HI(x)	(((x) >> 24) & 0xffffff)
 #define ADDR_LO(x)	((x) & 0xffffff)

 struct dcons_crom_softc {
 	struct firewire_dev_comm fd;
 	struct crom_chunk unit;
 	struct crom_chunk spec;
 	struct crom_chunk ver;
 	bus_dma_tag_t dma_tag;
 	bus_dmamap_t dma_map;
 	bus_addr_t bus_addr;
 	eventhandler_tag ehand;
 };

 static void
 dcons_crom_identify(driver_t *driver, device_t parent)
 {
 	BUS_ADD_CHILD(parent, 0, "dcons_crom", device_get_unit(parent));
 }

 static int
 dcons_crom_probe(device_t dev)
 {
 	device_t pa;

 	pa = device_get_parent(dev);
 	if(device_get_unit(dev) != device_get_unit(pa)){
 		return(ENXIO);
 	}

 	device_set_desc(dev, "dcons configuration ROM");
 	return (0);
 }

 #if (defined(__i386__) || defined(__amd64__))
 static void
 dcons_crom_expose_idt(struct dcons_crom_softc *sc)
 {
 	static off_t idt_paddr;

 	/* XXX */
+#ifdef __amd64__
 	idt_paddr = (char *)idt - (char *)KERNBASE;
+#else /* __i386__ */
+	idt_paddr = (off_t)pmap_kextract((vm_offset_t)idt);
+#endif

 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_HI, ADDR_HI(idt_paddr));
 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_LO, ADDR_LO(idt_paddr));
 }
 #endif

 static void
 dcons_crom_post_busreset(void *arg)
 {
 	struct dcons_crom_softc *sc;
 	struct crom_src *src;
 	struct crom_chunk *root;

 	sc = (struct dcons_crom_softc *) arg;
 	src = sc->fd.fc->crom_src;
 	root = sc->fd.fc->crom_root;

 	bzero(&sc->unit, sizeof(struct crom_chunk));

 	crom_add_chunk(src, root, &sc->unit, CROM_UDIR);
 	crom_add_entry(&sc->unit, CSRKEY_SPEC, CSRVAL_VENDOR_PRIVATE);
 	crom_add_simple_text(src, &sc->unit, &sc->spec, "FreeBSD");
 	crom_add_entry(&sc->unit, CSRKEY_VER, DCONS_CSR_VAL_VER);
 	crom_add_simple_text(src, &sc->unit, &sc->ver, "dcons");
 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_HI, ADDR_HI(dcons_paddr));
 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_LO, ADDR_LO(dcons_paddr));
 #if (defined(__i386__) || defined(__amd64__))
 	dcons_crom_expose_idt(sc);
 #endif
 }

 static void
 dmamap_cb(void *arg, bus_dma_segment_t *segments, int seg, int error)
 {
 	struct dcons_crom_softc *sc;

 	if (error)
 		printf("dcons_dmamap_cb: error=%d\n", error);

 	sc = (struct dcons_crom_softc *)arg;
 	sc->bus_addr = segments[0].ds_addr;

 	bus_dmamap_sync(sc->dma_tag, sc->dma_map, BUS_DMASYNC_PREWRITE);
 	device_printf(sc->fd.dev,
 	    "bus_addr 0x%jx\n", (uintmax_t)sc->bus_addr);
 	if (dcons_paddr != 0) {
 		/* XXX */
 		device_printf(sc->fd.dev, "dcons_paddr is already set\n");
 		return;
 	}
 	dcons_conf->dma_tag = sc->dma_tag;
 	dcons_conf->dma_map = sc->dma_map;
 	dcons_paddr = sc->bus_addr;

 	/* Force to be the high-level console */
 	if (force_console)
 		cnselect(dcons_conf->cdev);
 }

 static void
 dcons_crom_poll(void *p, int arg)
 {
 	struct dcons_crom_softc *sc = (struct dcons_crom_softc *) p;

 	sc->fd.fc->poll(sc->fd.fc, -1, -1);
 }

 static int
 dcons_crom_attach(device_t dev)
 {
 	struct dcons_crom_softc *sc;
 	int error;

 	if (dcons_conf->buf == NULL)
 		return (ENXIO);
 	sc = (struct dcons_crom_softc *) device_get_softc(dev);
 	sc->fd.fc = device_get_ivars(dev);
 	sc->fd.dev = dev;
 	sc->fd.post_explore = NULL;
 	sc->fd.post_busreset = (void *) dcons_crom_post_busreset;

 	/* map dcons buffer */
 	error = bus_dma_tag_create(
 		/*parent*/ sc->fd.fc->dmat,
 		/*alignment*/
sizeof(u_int32_t), /*boundary*/ 0, /*lowaddr*/ BUS_SPACE_MAXADDR, /*highaddr*/ BUS_SPACE_MAXADDR, /*filter*/NULL, /*filterarg*/NULL, /*maxsize*/ dcons_conf->size, /*nsegments*/ 1, /*maxsegsz*/ BUS_SPACE_MAXSIZE_32BIT, /*flags*/ BUS_DMA_ALLOCNOW, /*lockfunc*/busdma_lock_mutex, /*lockarg*/&Giant, &sc->dma_tag); if (error != 0) return (error); error = bus_dmamap_create(sc->dma_tag, BUS_DMA_COHERENT, &sc->dma_map); if (error != 0) return (error); error = bus_dmamap_load(sc->dma_tag, sc->dma_map, (void *)dcons_conf->buf, dcons_conf->size, dmamap_cb, sc, 0); if (error != 0) return (error); sc->ehand = EVENTHANDLER_REGISTER(dcons_poll, dcons_crom_poll, (void *)sc, 0); return (0); } static int dcons_crom_detach(device_t dev) { struct dcons_crom_softc *sc; sc = (struct dcons_crom_softc *) device_get_softc(dev); sc->fd.post_busreset = NULL; if (sc->ehand) EVENTHANDLER_DEREGISTER(dcons_poll, sc->ehand); /* XXX */ if (dcons_conf->dma_tag == sc->dma_tag) dcons_conf->dma_tag = NULL; bus_dmamap_unload(sc->dma_tag, sc->dma_map); bus_dmamap_destroy(sc->dma_tag, sc->dma_map); bus_dma_tag_destroy(sc->dma_tag); return 0; } static devclass_t dcons_crom_devclass; static device_method_t dcons_crom_methods[] = { /* device interface */ DEVMETHOD(device_identify, dcons_crom_identify), DEVMETHOD(device_probe, dcons_crom_probe), DEVMETHOD(device_attach, dcons_crom_attach), DEVMETHOD(device_detach, dcons_crom_detach), { 0, 0 } }; static driver_t dcons_crom_driver = { "dcons_crom", dcons_crom_methods, sizeof(struct dcons_crom_softc), }; DRIVER_MODULE(dcons_crom, firewire, dcons_crom_driver, dcons_crom_devclass, 0, 0); MODULE_VERSION(dcons_crom, 1); MODULE_DEPEND(dcons_crom, dcons, DCONS_VERSION, DCONS_VERSION, DCONS_VERSION); MODULE_DEPEND(dcons_crom, firewire, 1, 1, 1); Index: head/sys/dev/dcons/dcons_os.c =================================================================== --- head/sys/dev/dcons/dcons_os.c (revision 332488) +++ head/sys/dev/dcons/dcons_os.c (revision 332489) @@ -1,490 +1,495 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 2003,2004 * Hidetoshi Shimokawa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * * This product includes software developed by Hidetoshi Shimokawa. * * 4. Neither the name of the author nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_dcons.h" #include "opt_kdb.h" #include "opt_gdb.h" #include "opt_ddb.h" #ifndef DCONS_POLL_HZ #define DCONS_POLL_HZ 25 #endif #ifndef DCONS_POLL_IDLE #define DCONS_POLL_IDLE 256 #endif #ifndef DCONS_BUF_SIZE #define DCONS_BUF_SIZE (16*1024) #endif #ifndef DCONS_FORCE_CONSOLE #define DCONS_FORCE_CONSOLE 0 /* Mostly for FreeBSD-4/DragonFly */ #endif #ifndef KLD_MODULE static char bssbuf[DCONS_BUF_SIZE]; /* buf in bss */ #endif /* global data */ static struct dcons_global dg; struct dcons_global *dcons_conf; static int poll_hz = DCONS_POLL_HZ; static u_int poll_idle = DCONS_POLL_HZ * DCONS_POLL_IDLE; static struct dcons_softc sc[DCONS_NPORT]; static SYSCTL_NODE(_kern, OID_AUTO, dcons, CTLFLAG_RD, 0, "Dumb Console"); SYSCTL_INT(_kern_dcons, OID_AUTO, poll_hz, CTLFLAG_RW, &poll_hz, 0, "dcons polling rate"); static int drv_init = 0; static struct callout dcons_callout; struct dcons_buf *dcons_buf; /* for local dconschat */ static void dcons_timeout(void *); static int dcons_drv_init(int); static cn_probe_t dcons_cnprobe; static cn_init_t dcons_cninit; static cn_term_t dcons_cnterm; static cn_getc_t dcons_cngetc; static cn_putc_t dcons_cnputc; static cn_grab_t dcons_cngrab; static cn_ungrab_t dcons_cnungrab; CONSOLE_DRIVER(dcons); #if defined(GDB) static gdb_probe_f dcons_dbg_probe; static gdb_init_f dcons_dbg_init; static gdb_term_f dcons_dbg_term; static gdb_getc_f dcons_dbg_getc; static gdb_putc_f dcons_dbg_putc; GDB_DBGPORT(dcons, dcons_dbg_probe, dcons_dbg_init, dcons_dbg_term, dcons_dbg_getc, dcons_dbg_putc); extern struct gdb_dbgport *gdb_cur; #endif static tsw_outwakeup_t dcons_outwakeup; static struct ttydevsw dcons_ttydevsw = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = dcons_outwakeup, }; #if (defined(GDB) || defined(DDB)) static int dcons_check_break(struct dcons_softc *dc, int c) { if (c < 0) return (c); #ifdef GDB if ((dc->flags & DC_GDB) != 0 && gdb_cur == &dcons_gdb_dbgport) kdb_alt_break_gdb(c, &dc->brk_state); else #endif kdb_alt_break(c, &dc->brk_state); return (c); } #else #define dcons_check_break(dc, c) (c) #endif static int dcons_os_checkc_nopoll(struct dcons_softc *dc) { int c; if (dg.dma_tag != NULL) bus_dmamap_sync(dg.dma_tag, dg.dma_map, BUS_DMASYNC_POSTREAD); c = dcons_check_break(dc, dcons_checkc(dc)); if (dg.dma_tag != NULL) bus_dmamap_sync(dg.dma_tag, dg.dma_map, BUS_DMASYNC_PREREAD); return (c); } static int dcons_os_checkc(struct dcons_softc *dc) { EVENTHANDLER_INVOKE(dcons_poll, 0); return (dcons_os_checkc_nopoll(dc)); } static void dcons_os_putc(struct dcons_softc *dc, int c) { if (dg.dma_tag != NULL) bus_dmamap_sync(dg.dma_tag, dg.dma_map, BUS_DMASYNC_POSTWRITE); dcons_putc(dc, c); if (dg.dma_tag != NULL) bus_dmamap_sync(dg.dma_tag, dg.dma_map, BUS_DMASYNC_PREWRITE); } static void dcons_outwakeup(struct tty 
*tp) { struct dcons_softc *dc; char ch; dc = tty_softc(tp); while (ttydisc_getc(tp, &ch, sizeof ch) != 0) dcons_os_putc(dc, ch); } static void dcons_timeout(void *v) { struct tty *tp; struct dcons_softc *dc; int i, c, polltime; for (i = 0; i < DCONS_NPORT; i ++) { dc = &sc[i]; tp = dc->tty; tty_lock(tp); while ((c = dcons_os_checkc_nopoll(dc)) != -1) { ttydisc_rint(tp, c, 0); poll_idle = 0; } ttydisc_rint_done(tp); tty_unlock(tp); } poll_idle++; polltime = hz; if (poll_idle <= (poll_hz * DCONS_POLL_IDLE)) polltime /= poll_hz; callout_reset(&dcons_callout, polltime, dcons_timeout, tp); } static void dcons_cnprobe(struct consdev *cp) { sprintf(cp->cn_name, "dcons"); #if DCONS_FORCE_CONSOLE cp->cn_pri = CN_REMOTE; #else cp->cn_pri = CN_NORMAL; #endif } static void dcons_cninit(struct consdev *cp) { dcons_drv_init(0); cp->cn_arg = (void *)&sc[DCONS_CON]; /* share port0 with unit0 */ } static void dcons_cnterm(struct consdev *cp) { } static void dcons_cngrab(struct consdev *cp) { } static void dcons_cnungrab(struct consdev *cp) { } static int dcons_cngetc(struct consdev *cp) { struct dcons_softc *dc = (struct dcons_softc *)cp->cn_arg; return (dcons_os_checkc(dc)); } static void dcons_cnputc(struct consdev *cp, int c) { struct dcons_softc *dc = (struct dcons_softc *)cp->cn_arg; dcons_os_putc(dc, c); } static int dcons_drv_init(int stage) { #if defined(__i386__) || defined(__amd64__) quad_t addr, size; #endif if (drv_init) return(drv_init); drv_init = -1; bzero(&dg, sizeof(dg)); dcons_conf = &dg; dg.cdev = &dcons_consdev; dg.buf = NULL; dg.size = DCONS_BUF_SIZE; #if defined(__i386__) || defined(__amd64__) if (getenv_quad("dcons.addr", &addr) > 0 && getenv_quad("dcons.size", &size) > 0) { #ifdef __i386__ vm_paddr_t pa; /* * Allow read/write access to dcons buffer. */ for (pa = trunc_page(addr); pa < addr + size; pa += PAGE_SIZE) - *vtopte(KERNBASE + pa) |= PG_RW; + *vtopte(PMAP_MAP_LOW + pa) |= PG_RW; invltlb(); #endif /* XXX P to V */ +#ifdef __amd64__ dg.buf = (struct dcons_buf *)(vm_offset_t)(KERNBASE + addr); +#else /* __i386__ */ + dg.buf = (struct dcons_buf *)((vm_offset_t)PMAP_MAP_LOW + + addr); +#endif dg.size = size; if (dcons_load_buffer(dg.buf, dg.size, sc) < 0) dg.buf = NULL; } #endif if (dg.buf != NULL) goto ok; #ifndef KLD_MODULE if (stage == 0) { /* XXX or cold */ /* * DCONS_FORCE_CONSOLE == 1 and statically linked. * called from cninit(). can't use contigmalloc yet . */ dg.buf = (struct dcons_buf *) bssbuf; dcons_init(dg.buf, dg.size, sc); } else #endif { /* * DCONS_FORCE_CONSOLE == 0 or kernel module case. * if the module is loaded after boot, * bssbuf could be non-continuous. 
*/ dg.buf = (struct dcons_buf *) contigmalloc(dg.size, M_DEVBUF, 0, 0x10000, 0xffffffff, PAGE_SIZE, 0ul); if (dg.buf == NULL) return (-1); dcons_init(dg.buf, dg.size, sc); } ok: dcons_buf = dg.buf; drv_init = 1; return 0; } static int dcons_attach_port(int port, char *name, int flags) { struct dcons_softc *dc; struct tty *tp; dc = &sc[port]; tp = tty_alloc(&dcons_ttydevsw, dc); dc->flags = flags; dc->tty = tp; tty_init_console(tp, 0); tty_makedev(tp, NULL, "%s", name); return(0); } static int dcons_attach(void) { int polltime; dcons_attach_port(DCONS_CON, "dcons", 0); dcons_attach_port(DCONS_GDB, "dgdb", DC_GDB); callout_init(&dcons_callout, 1); polltime = hz / poll_hz; callout_reset(&dcons_callout, polltime, dcons_timeout, NULL); return(0); } static int dcons_detach(int port) { struct tty *tp; struct dcons_softc *dc; dc = &sc[port]; tp = dc->tty; tty_lock(tp); tty_rel_gone(tp); return(0); } static int dcons_modevent(module_t mode, int type, void *data) { int err = 0, ret; switch (type) { case MOD_LOAD: ret = dcons_drv_init(1); if (ret != -1) dcons_attach(); if (ret == 0) { dcons_cnprobe(&dcons_consdev); dcons_cninit(&dcons_consdev); cnadd(&dcons_consdev); } break; case MOD_UNLOAD: printf("dcons: unload\n"); if (drv_init == 1) { callout_stop(&dcons_callout); cnremove(&dcons_consdev); dcons_detach(DCONS_CON); dcons_detach(DCONS_GDB); dg.buf->magic = 0; contigfree(dg.buf, DCONS_BUF_SIZE, M_DEVBUF); } break; case MOD_SHUTDOWN: #if 0 /* Keep connection after halt */ dg.buf->magic = 0; #endif break; default: err = EOPNOTSUPP; break; } return(err); } #if defined(GDB) /* Debugger interface */ static int dcons_os_getc(struct dcons_softc *dc) { int c; while ((c = dcons_os_checkc(dc)) == -1); return (c & 0xff); } static int dcons_dbg_probe(void) { int dcons_gdb; if (getenv_int("dcons_gdb", &dcons_gdb) == 0) return (-1); return (dcons_gdb); } static void dcons_dbg_init(void) { } static void dcons_dbg_term(void) { } static void dcons_dbg_putc(int c) { struct dcons_softc *dc = &sc[DCONS_GDB]; dcons_os_putc(dc, c); } static int dcons_dbg_getc(void) { struct dcons_softc *dc = &sc[DCONS_GDB]; return (dcons_os_getc(dc)); } #endif DEV_MODULE(dcons, dcons_modevent, NULL); MODULE_VERSION(dcons, DCONS_VERSION); Index: head/sys/dev/hyperv/vmbus/i386/vmbus_vector.S =================================================================== --- head/sys/dev/hyperv/vmbus/i386/vmbus_vector.S (revision 332488) +++ head/sys/dev/hyperv/vmbus/i386/vmbus_vector.S (revision 332489) @@ -1,50 +1,52 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
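dcons_drv_init() above honors the dcons.addr and dcons.size loader tunables via getenv_quad() so that a buffer reserved before boot can be reused. A hypothetical /boot/loader.conf fragment (the values are placeholders only):

dcons.addr="0x01000000"		# physical address of a pre-reserved buffer
dcons.size="0x4000"		# its size in bytes (16 KiB here)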
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ +#include "assym.inc" + +#include #include #include -#include "assym.inc" - /* * This is the Hyper-V vmbus channel direct callback interrupt. * Only used when it is running on Hyper-V. */ .text SUPERALIGN_TEXT IDTVEC(vmbus_isr_pti) IDTVEC(vmbus_isr) PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call vmbus_handle_intr add $4, %esp MEXITCOUNT jmp doreti Index: head/sys/dev/ppc/ppc.c =================================================================== --- head/sys/dev/ppc/ppc.c (revision 332488) +++ head/sys/dev/ppc/ppc.c (revision 332489) @@ -1,2006 +1,2007 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997-2000 Nicolas Souchu * Copyright (c) 2001 Alcove - Nicolas Souchu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ppc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __i386__ #include #include #include +#include #endif #include #include #include #include #include "ppbus_if.h" static void ppcintr(void *arg); #define IO_LPTSIZE_EXTENDED 8 /* "Extended" LPT controllers */ #define IO_LPTSIZE_NORMAL 4 /* "Normal" LPT controllers */ #define LOG_PPC(function, ppc, string) \ if (bootverbose) printf("%s: %s\n", function, string) #define DEVTOSOFTC(dev) ((struct ppc_data *)device_get_softc(dev)) /* * We use critical enter/exit for the simple config locking needed to * detect the devices. We just want to make sure that both of our writes * happen without someone else also writing to those config registers. 
Since * we just do this at startup, Giant keeps multiple threads from executing, * and critical_enter() then is all that's needed to keep us from being preempted * during the critical sequences with the hardware. * * Note: this doesn't prevent multiple threads from putting the chips into * config mode, but since we only do that to detect the type at startup the * extra overhead isn't needed since Giant protects us from multiple entry * and no other code changes these registers. */ #define PPC_CONFIG_LOCK(ppc) critical_enter() #define PPC_CONFIG_UNLOCK(ppc) critical_exit() devclass_t ppc_devclass; const char ppc_driver_name[] = "ppc"; static char *ppc_models[] = { "SMC-like", "SMC FDC37C665GT", "SMC FDC37C666GT", "PC87332", "PC87306", "82091AA", "Generic", "W83877F", "W83877AF", "Winbond", "PC87334", "SMC FDC37C935", "PC87303", 0 }; /* list of available modes */ static char *ppc_avms[] = { "COMPATIBLE", "NIBBLE-only", "PS2-only", "PS2/NIBBLE", "EPP-only", "EPP/NIBBLE", "EPP/PS2", "EPP/PS2/NIBBLE", "ECP-only", "ECP/NIBBLE", "ECP/PS2", "ECP/PS2/NIBBLE", "ECP/EPP", "ECP/EPP/NIBBLE", "ECP/EPP/PS2", "ECP/EPP/PS2/NIBBLE", 0 }; /* list of current executing modes * Note that few modes do not actually exist. */ static char *ppc_modes[] = { "COMPATIBLE", "NIBBLE", "PS/2", "PS/2", "EPP", "EPP", "EPP", "EPP", "ECP", "ECP", "ECP+PS2", "ECP+PS2", "ECP+EPP", "ECP+EPP", "ECP+EPP", "ECP+EPP", 0 }; static char *ppc_epp_protocol[] = { " (EPP 1.9)", " (EPP 1.7)", 0 }; #ifdef __i386__ /* * BIOS printer list - used by BIOS probe. */ #define BIOS_PPC_PORTS 0x408 -#define BIOS_PORTS (short *)(KERNBASE+BIOS_PPC_PORTS) +#define BIOS_PORTS ((short *)BIOS_PADDRTOVADDR(BIOS_PPC_PORTS)) #define BIOS_MAX_PPC 4 #endif /* * ppc_ecp_sync() XXX */ int ppc_ecp_sync(device_t dev) { int i, r; struct ppc_data *ppc = DEVTOSOFTC(dev); PPC_ASSERT_LOCKED(ppc); if (!(ppc->ppc_avm & PPB_ECP) && !(ppc->ppc_dtm & PPB_ECP)) return 0; r = r_ecr(ppc); if ((r & 0xe0) != PPC_ECR_EPP) return 0; for (i = 0; i < 100; i++) { r = r_ecr(ppc); if (r & 0x1) return 0; DELAY(100); } device_printf(dev, "ECP sync failed as data still present in FIFO.\n"); return 0; } /* * ppc_detect_fifo() * * Detect parallel port FIFO */ static int ppc_detect_fifo(struct ppc_data *ppc) { char ecr_sav; char ctr_sav, ctr, cc; short i; /* save registers */ ecr_sav = r_ecr(ppc); ctr_sav = r_ctr(ppc); /* enter ECP configuration mode, no interrupt, no DMA */ w_ecr(ppc, 0xf4); /* read PWord size - transfers in FIFO mode must be PWord aligned */ ppc->ppc_pword = (r_cnfgA(ppc) & PPC_PWORD_MASK); /* XXX 16 and 32 bits implementations not supported */ if (ppc->ppc_pword != PPC_PWORD_8) { LOG_PPC(__func__, ppc, "PWord not supported"); goto error; } w_ecr(ppc, 0x34); /* byte mode, no interrupt, no DMA */ ctr = r_ctr(ppc); w_ctr(ppc, ctr | PCD); /* set direction to 1 */ /* enter ECP test mode, no interrupt, no DMA */ w_ecr(ppc, 0xd4); /* flush the FIFO */ for (i=0; i<1024; i++) { if (r_ecr(ppc) & PPC_FIFO_EMPTY) break; cc = r_fifo(ppc); } if (i >= 1024) { LOG_PPC(__func__, ppc, "can't flush FIFO"); goto error; } /* enable interrupts, no DMA */ w_ecr(ppc, 0xd0); /* determine readIntrThreshold * fill the FIFO until serviceIntr is set */ for (i=0; i<1024; i++) { w_fifo(ppc, (char)i); if (!ppc->ppc_rthr && (r_ecr(ppc) & PPC_SERVICE_INTR)) { /* readThreshold reached */ ppc->ppc_rthr = i+1; } if (r_ecr(ppc) & PPC_FIFO_FULL) { ppc->ppc_fifo = i+1; break; } } if (i >= 1024) { LOG_PPC(__func__, ppc, "can't fill FIFO"); goto error; } w_ecr(ppc, 0xd4); /* test mode, no interrupt, no DMA */ 
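The PPC_CONFIG_LOCK comment earlier reduces to a short pattern around the chip-specific unlock writes; a sketch assuming a two-write magic-key sequence (csr and MAGIC_KEY are placeholders for whatever a given chip needs):

	PPC_CONFIG_LOCK(ppc);		/* critical_enter() */
	outb(csr, MAGIC_KEY);		/* the two writes must land */
	outb(csr, MAGIC_KEY);		/* back to back, unpreempted */
	PPC_CONFIG_UNLOCK(ppc);		/* critical_exit() */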
w_ctr(ppc, ctr & ~PCD); /* set direction to 0 */ w_ecr(ppc, 0xd0); /* enable interrupts */ /* determine writeIntrThreshold * empty the FIFO until serviceIntr is set */ for (i=ppc->ppc_fifo; i>0; i--) { if (r_fifo(ppc) != (char)(ppc->ppc_fifo-i)) { LOG_PPC(__func__, ppc, "invalid data in FIFO"); goto error; } if (r_ecr(ppc) & PPC_SERVICE_INTR) { /* writeIntrThreshold reached */ ppc->ppc_wthr = ppc->ppc_fifo - i+1; } /* if FIFO empty before the last byte, error */ if (i>1 && (r_ecr(ppc) & PPC_FIFO_EMPTY)) { LOG_PPC(__func__, ppc, "data lost in FIFO"); goto error; } } /* FIFO must be empty after the last byte */ if (!(r_ecr(ppc) & PPC_FIFO_EMPTY)) { LOG_PPC(__func__, ppc, "can't empty the FIFO"); goto error; } w_ctr(ppc, ctr_sav); w_ecr(ppc, ecr_sav); return (0); error: w_ctr(ppc, ctr_sav); w_ecr(ppc, ecr_sav); return (EINVAL); } static int ppc_detect_port(struct ppc_data *ppc) { w_ctr(ppc, 0x0c); /* To avoid missing PS2 ports */ w_dtr(ppc, 0xaa); if (r_dtr(ppc) != 0xaa) return (0); return (1); } /* * EPP timeout, according to the PC87332 manual * Semantics of clearing EPP timeout bit. * PC87332 - reading SPP_STR does it... * SMC - write 1 to EPP timeout bit XXX * Others - (?) write 0 to EPP timeout bit */ static void ppc_reset_epp_timeout(struct ppc_data *ppc) { char r; r = r_str(ppc); w_str(ppc, r | 0x1); w_str(ppc, r & 0xfe); return; } static int ppc_check_epp_timeout(struct ppc_data *ppc) { ppc_reset_epp_timeout(ppc); return (!(r_str(ppc) & TIMEOUT)); } /* * Configure current operating mode */ static int ppc_generic_setmode(struct ppc_data *ppc, int mode) { u_char ecr = 0; /* check if mode is available */ if (mode && !(ppc->ppc_avm & mode)) return (EINVAL); /* if ECP mode, configure ecr register */ if ((ppc->ppc_avm & PPB_ECP) || (ppc->ppc_dtm & PPB_ECP)) { /* return to byte mode (keeping direction bit), * no interrupt, no DMA to be able to change to * ECP */ w_ecr(ppc, PPC_ECR_RESET); ecr = PPC_DISABLE_INTR; if (mode & PPB_EPP) return (EINVAL); else if (mode & PPB_ECP) /* select ECP mode */ ecr |= PPC_ECR_ECP; else if (mode & PPB_PS2) /* select PS2 mode with ECP */ ecr |= PPC_ECR_PS2; else /* select COMPATIBLE/NIBBLE mode */ ecr |= PPC_ECR_STD; w_ecr(ppc, ecr); } ppc->ppc_mode = mode; return (0); } /* * The ppc driver is free to choose options like FIFO or DMA * if ECP mode is available. * * The 'RAW' option allows the upper drivers to force the ppc mode * even with FIFO, DMA available. */ static int ppc_smclike_setmode(struct ppc_data *ppc, int mode) { u_char ecr = 0; /* check if mode is available */ if (mode && !(ppc->ppc_avm & mode)) return (EINVAL); /* if ECP mode, configure ecr register */ if ((ppc->ppc_avm & PPB_ECP) || (ppc->ppc_dtm & PPB_ECP)) { /* return to byte mode (keeping direction bit), * no interrupt, no DMA to be able to change to * ECP or EPP mode */ w_ecr(ppc, PPC_ECR_RESET); ecr = PPC_DISABLE_INTR; if (mode & PPB_EPP) /* select EPP mode */ ecr |= PPC_ECR_EPP; else if (mode & PPB_ECP) /* select ECP mode */ ecr |= PPC_ECR_ECP; else if (mode & PPB_PS2) /* select PS2 mode with ECP */ ecr |= PPC_ECR_PS2; else /* select COMPATIBLE/NIBBLE mode */ ecr |= PPC_ECR_STD; w_ecr(ppc, ecr); } ppc->ppc_mode = mode; return (0); } #ifdef PPC_PROBE_CHIPSET /* * ppc_pc873xx_detect * * Probe for a Natsemi PC873xx-family part. * * References in this function are to the National Semiconductor * PC87332 datasheet TL/C/11930, May 1995 revision. 
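The chipset probes that follow (PC873xx, SMC 37C66x/37C935, Winbond W83877F) all drive their configuration space through the same index/data idiom: write a register index to the ID port, then read or write the datum one port higher. A generic sketch of that idiom (idport stands for whichever base a given probe uses):

static u_char
cfg_read(int idport, u_char index)
{
	outb(idport, index);		/* select the config register */
	return (inb(idport + 1));	/* fetch its value */
}

static void
cfg_write(int idport, u_char index, u_char val)
{
	outb(idport, index);
	outb(idport + 1, val);		/* some parts want the data written twice */
}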
*/ static int pc873xx_basetab[] = {0x0398, 0x026e, 0x015c, 0x002e, 0}; static int pc873xx_porttab[] = {0x0378, 0x03bc, 0x0278, 0}; static int pc873xx_irqtab[] = {5, 7, 5, 0}; static int pc873xx_regstab[] = { PC873_FER, PC873_FAR, PC873_PTR, PC873_FCR, PC873_PCR, PC873_PMC, PC873_TUP, PC873_SID, PC873_PNP0, PC873_PNP1, PC873_LPTBA, -1 }; static char *pc873xx_rnametab[] = { "FER", "FAR", "PTR", "FCR", "PCR", "PMC", "TUP", "SID", "PNP0", "PNP1", "LPTBA", NULL }; static int ppc_pc873xx_detect(struct ppc_data *ppc, int chipset_mode) /* XXX mode never forced */ { static int index = 0; int idport, irq; int ptr, pcr, val, i; while ((idport = pc873xx_basetab[index++])) { /* XXX should check first to see if this location is already claimed */ /* * Pull the 873xx through the power-on ID cycle (2.2,1.). * We can't use this to locate the chip as it may already have * been used by the BIOS. */ (void)inb(idport); (void)inb(idport); (void)inb(idport); (void)inb(idport); /* * Read the SID byte. Possible values are : * * 01010xxx PC87334 * 0001xxxx PC87332 * 01110xxx PC87306 * 00110xxx PC87303 */ outb(idport, PC873_SID); val = inb(idport + 1); if ((val & 0xf0) == 0x10) { ppc->ppc_model = NS_PC87332; } else if ((val & 0xf8) == 0x70) { ppc->ppc_model = NS_PC87306; } else if ((val & 0xf8) == 0x50) { ppc->ppc_model = NS_PC87334; } else if ((val & 0xf8) == 0x40) { /* Should be 0x30 by the documentation, but probing yielded 0x40... */ ppc->ppc_model = NS_PC87303; } else { if (bootverbose && (val != 0xff)) printf("PC873xx probe at 0x%x got unknown ID 0x%x\n", idport, val); continue ; /* not recognised */ } /* print registers */ if (bootverbose) { printf("PC873xx"); for (i=0; pc873xx_regstab[i] != -1; i++) { outb(idport, pc873xx_regstab[i]); printf(" %s=0x%x", pc873xx_rnametab[i], inb(idport + 1) & 0xff); } printf("\n"); } /* * We think we have one. Is it enabled and where we want it to be? */ outb(idport, PC873_FER); val = inb(idport + 1); if (!(val & PC873_PPENABLE)) { if (bootverbose) printf("PC873xx parallel port disabled\n"); continue; } outb(idport, PC873_FAR); val = inb(idport + 1); /* XXX we should create a driver instance for every port found */ if (pc873xx_porttab[val & 0x3] != ppc->ppc_base) { /* First try to change the port address to that requested... */ switch (ppc->ppc_base) { case 0x378: val &= 0xfc; break; case 0x3bc: val &= 0xfd; break; case 0x278: val &= 0xfe; break; default: val &= 0xfd; break; } outb(idport, PC873_FAR); outb(idport + 1, val); outb(idport + 1, val); /* Check for success by reading back the value we supposedly wrote and comparing...*/ outb(idport, PC873_FAR); val = inb(idport + 1) & 0x3; /* If we fail, report the failure... */ if (pc873xx_porttab[val] != ppc->ppc_base) { if (bootverbose) printf("PC873xx at 0x%x not for driver at port 0x%x\n", pc873xx_porttab[val], ppc->ppc_base); } continue; } outb(idport, PC873_PTR); ptr = inb(idport + 1); /* get irq settings */ if (ppc->ppc_base == 0x378) irq = (ptr & PC873_LPTBIRQ7) ? 
7 : 5; else irq = pc873xx_irqtab[val]; if (bootverbose) printf("PC873xx irq %d at 0x%x\n", irq, ppc->ppc_base); /* * Check if irq settings are correct */ if (irq != ppc->ppc_irq) { /* * If the chipset is not locked and base address is 0x378, * we have another chance */ if (ppc->ppc_base == 0x378 && !(ptr & PC873_CFGLOCK)) { if (ppc->ppc_irq == 7) { outb(idport + 1, (ptr | PC873_LPTBIRQ7)); outb(idport + 1, (ptr | PC873_LPTBIRQ7)); } else { outb(idport + 1, (ptr & ~PC873_LPTBIRQ7)); outb(idport + 1, (ptr & ~PC873_LPTBIRQ7)); } if (bootverbose) printf("PC873xx irq set to %d\n", ppc->ppc_irq); } else { if (bootverbose) printf("PC873xx sorry, can't change irq setting\n"); } } else { if (bootverbose) printf("PC873xx irq settings are correct\n"); } outb(idport, PC873_PCR); pcr = inb(idport + 1); if ((ptr & PC873_CFGLOCK) || !chipset_mode) { if (bootverbose) printf("PC873xx %s", (ptr & PC873_CFGLOCK)?"locked":"unlocked"); ppc->ppc_avm |= PPB_NIBBLE; if (bootverbose) printf(", NIBBLE"); if (pcr & PC873_EPPEN) { ppc->ppc_avm |= PPB_EPP; if (bootverbose) printf(", EPP"); if (pcr & PC873_EPP19) ppc->ppc_epp = EPP_1_9; else ppc->ppc_epp = EPP_1_7; if ((ppc->ppc_model == NS_PC87332) && bootverbose) { outb(idport, PC873_PTR); ptr = inb(idport + 1); if (ptr & PC873_EPPRDIR) printf(", Regular mode"); else printf(", Automatic mode"); } } else if (pcr & PC873_ECPEN) { ppc->ppc_avm |= PPB_ECP; if (bootverbose) printf(", ECP"); if (pcr & PC873_ECPCLK) { /* XXX */ ppc->ppc_avm |= PPB_PS2; if (bootverbose) printf(", PS/2"); } } else { outb(idport, PC873_PTR); ptr = inb(idport + 1); if (ptr & PC873_EXTENDED) { ppc->ppc_avm |= PPB_SPP; if (bootverbose) printf(", SPP"); } } } else { if (bootverbose) printf("PC873xx unlocked"); if (chipset_mode & PPB_ECP) { if ((chipset_mode & PPB_EPP) && bootverbose) printf(", ECP+EPP not supported"); pcr &= ~PC873_EPPEN; pcr |= (PC873_ECPEN | PC873_ECPCLK); /* XXX */ outb(idport + 1, pcr); outb(idport + 1, pcr); if (bootverbose) printf(", ECP"); } else if (chipset_mode & PPB_EPP) { pcr &= ~(PC873_ECPEN | PC873_ECPCLK); pcr |= (PC873_EPPEN | PC873_EPP19); outb(idport + 1, pcr); outb(idport + 1, pcr); ppc->ppc_epp = EPP_1_9; /* XXX */ if (bootverbose) printf(", EPP1.9"); /* enable automatic direction turnover */ if (ppc->ppc_model == NS_PC87332) { outb(idport, PC873_PTR); ptr = inb(idport + 1); ptr &= ~PC873_EPPRDIR; outb(idport + 1, ptr); outb(idport + 1, ptr); if (bootverbose) printf(", Automatic mode"); } } else { pcr &= ~(PC873_ECPEN | PC873_ECPCLK | PC873_EPPEN); outb(idport + 1, pcr); outb(idport + 1, pcr); /* configure extended bit in PTR */ outb(idport, PC873_PTR); ptr = inb(idport + 1); if (chipset_mode & PPB_PS2) { ptr |= PC873_EXTENDED; if (bootverbose) printf(", PS/2"); } else { /* default to NIBBLE mode */ ptr &= ~PC873_EXTENDED; if (bootverbose) printf(", NIBBLE"); } outb(idport + 1, ptr); outb(idport + 1, ptr); } ppc->ppc_avm = chipset_mode; } if (bootverbose) printf("\n"); ppc->ppc_type = PPC_TYPE_GENERIC; ppc_generic_setmode(ppc, chipset_mode); return(chipset_mode); } return(-1); } /* * ppc_smc37c66xgt_detect * * SMC FDC37C66xGT configuration. */ static int ppc_smc37c66xgt_detect(struct ppc_data *ppc, int chipset_mode) { int i; u_char r; int type = -1; int csr = SMC66x_CSR; /* initial value is 0x3F0 */ int port_address[] = { -1 /* disabled */ , 0x3bc, 0x378, 0x278 }; #define cio csr+1 /* config IO port is either 0x3F1 or 0x371 */ /* * Detection: enter configuration mode and read CRD register. 
*/ PPC_CONFIG_LOCK(ppc); outb(csr, SMC665_iCODE); outb(csr, SMC665_iCODE); PPC_CONFIG_UNLOCK(ppc); outb(csr, 0xd); if (inb(cio) == 0x65) { type = SMC_37C665GT; goto config; } for (i = 0; i < 2; i++) { PPC_CONFIG_LOCK(ppc); outb(csr, SMC666_iCODE); outb(csr, SMC666_iCODE); PPC_CONFIG_UNLOCK(ppc); outb(csr, 0xd); if (inb(cio) == 0x66) { type = SMC_37C666GT; break; } /* Another chance, CSR may be hard-configured to be at 0x370 */ csr = SMC666_CSR; } config: /* * If chipset not found, do not continue. */ if (type == -1) { outb(csr, 0xaa); /* end config mode */ return (-1); } /* select CR1 */ outb(csr, 0x1); /* read the port's address: bits 0 and 1 of CR1 */ r = inb(cio) & SMC_CR1_ADDR; if (port_address[(int)r] != ppc->ppc_base) { outb(csr, 0xaa); /* end config mode */ return (-1); } ppc->ppc_model = type; /* * CR1 and CR4 registers bits 3 and 0/1 for mode configuration * If SPP mode is detected, try to set ECP+EPP mode */ if (bootverbose) { outb(csr, 0x1); device_printf(ppc->ppc_dev, "SMC registers CR1=0x%x", inb(cio) & 0xff); outb(csr, 0x4); printf(" CR4=0x%x", inb(cio) & 0xff); } /* select CR1 */ outb(csr, 0x1); if (!chipset_mode) { /* autodetect mode */ /* 666GT is ~certainly~ hardwired to an extended ECP+EPP mode */ if (type == SMC_37C666GT) { ppc->ppc_avm |= PPB_ECP | PPB_EPP | PPB_SPP; if (bootverbose) printf(" configuration hardwired, supposing " \ "ECP+EPP SPP"); } else if ((inb(cio) & SMC_CR1_MODE) == 0) { /* already in extended parallel port mode, read CR4 */ outb(csr, 0x4); r = (inb(cio) & SMC_CR4_EMODE); switch (r) { case SMC_SPP: ppc->ppc_avm |= PPB_SPP; if (bootverbose) printf(" SPP"); break; case SMC_EPPSPP: ppc->ppc_avm |= PPB_EPP | PPB_SPP; if (bootverbose) printf(" EPP SPP"); break; case SMC_ECP: ppc->ppc_avm |= PPB_ECP | PPB_SPP; if (bootverbose) printf(" ECP SPP"); break; case SMC_ECPEPP: ppc->ppc_avm |= PPB_ECP | PPB_EPP | PPB_SPP; if (bootverbose) printf(" ECP+EPP SPP"); break; } } else { /* not an extended port mode */ ppc->ppc_avm |= PPB_SPP; if (bootverbose) printf(" SPP"); } } else { /* mode forced */ ppc->ppc_avm = chipset_mode; /* 666GT is ~certainly~ hardwired to an extended ECP+EPP mode */ if (type == SMC_37C666GT) goto end_detect; r = inb(cio); if ((chipset_mode & (PPB_ECP | PPB_EPP)) == 0) { /* do not use ECP when the mode is not forced to */ outb(cio, r | SMC_CR1_MODE); if (bootverbose) printf(" SPP"); } else { /* an extended mode is selected */ outb(cio, r & ~SMC_CR1_MODE); /* read CR4 register and reset mode field */ outb(csr, 0x4); r = inb(cio) & ~SMC_CR4_EMODE; if (chipset_mode & PPB_ECP) { if (chipset_mode & PPB_EPP) { outb(cio, r | SMC_ECPEPP); if (bootverbose) printf(" ECP+EPP"); } else { outb(cio, r | SMC_ECP); if (bootverbose) printf(" ECP"); } } else { /* PPB_EPP is set */ outb(cio, r | SMC_EPPSPP); if (bootverbose) printf(" EPP SPP"); } } ppc->ppc_avm = chipset_mode; } /* set FIFO threshold to 16 */ if (ppc->ppc_avm & PPB_ECP) { /* select CRA */ outb(csr, 0xa); outb(cio, 16); } end_detect: if (bootverbose) printf ("\n"); if (ppc->ppc_avm & PPB_EPP) { /* select CR4 */ outb(csr, 0x4); r = inb(cio); /* * Set the EPP protocol... 
* Low=EPP 1.9 (1284 standard) and High=EPP 1.7 */ if (ppc->ppc_epp == EPP_1_9) outb(cio, (r & ~SMC_CR4_EPPTYPE)); else outb(cio, (r | SMC_CR4_EPPTYPE)); } outb(csr, 0xaa); /* end config mode */ ppc->ppc_type = PPC_TYPE_SMCLIKE; ppc_smclike_setmode(ppc, chipset_mode); return (chipset_mode); } /* * SMC FDC37C935 configuration * Found on many Alpha machines */ static int ppc_smc37c935_detect(struct ppc_data *ppc, int chipset_mode) { int type = -1; PPC_CONFIG_LOCK(ppc); outb(SMC935_CFG, 0x55); /* enter config mode */ outb(SMC935_CFG, 0x55); PPC_CONFIG_UNLOCK(ppc); outb(SMC935_IND, SMC935_ID); /* check device id */ if (inb(SMC935_DAT) == 0x2) type = SMC_37C935; if (type == -1) { outb(SMC935_CFG, 0xaa); /* exit config mode */ return (-1); } ppc->ppc_model = type; outb(SMC935_IND, SMC935_LOGDEV); /* select parallel port, */ outb(SMC935_DAT, 3); /* which is logical device 3 */ /* set io port base */ outb(SMC935_IND, SMC935_PORTHI); outb(SMC935_DAT, (u_char)((ppc->ppc_base & 0xff00) >> 8)); outb(SMC935_IND, SMC935_PORTLO); outb(SMC935_DAT, (u_char)(ppc->ppc_base & 0xff)); if (!chipset_mode) ppc->ppc_avm = PPB_COMPATIBLE; /* default mode */ else { ppc->ppc_avm = chipset_mode; outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_CENT); /* start in compatible mode */ /* SPP + EPP or just plain SPP */ if (chipset_mode & (PPB_SPP)) { if (chipset_mode & PPB_EPP) { if (ppc->ppc_epp == EPP_1_9) { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_EPP19SPP); } if (ppc->ppc_epp == EPP_1_7) { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_EPP17SPP); } } else { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_SPP); } } /* ECP + EPP or just plain ECP */ if (chipset_mode & PPB_ECP) { if (chipset_mode & PPB_EPP) { if (ppc->ppc_epp == EPP_1_9) { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_ECPEPP19); } if (ppc->ppc_epp == EPP_1_7) { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_ECPEPP17); } } else { outb(SMC935_IND, SMC935_PPMODE); outb(SMC935_DAT, SMC935_ECP); } } } outb(SMC935_CFG, 0xaa); /* exit config mode */ ppc->ppc_type = PPC_TYPE_SMCLIKE; ppc_smclike_setmode(ppc, chipset_mode); return (chipset_mode); } /* * Winbond W83877F stuff * * EFER: extended function enable register * EFIR: extended function index register * EFDR: extended function data register */ #define efir ((efer == 0x250) ? 0x251 : 0x3f0) #define efdr ((efer == 0x250) ? 
0x252 : 0x3f1) static int w83877f_efers[] = { 0x250, 0x3f0, 0x3f0, 0x250 }; static int w83877f_keys[] = { 0x89, 0x86, 0x87, 0x88 }; static int w83877f_keyiter[] = { 1, 2, 2, 1 }; static int w83877f_hefs[] = { WINB_HEFERE, WINB_HEFRAS, WINB_HEFERE | WINB_HEFRAS, 0 }; static int ppc_w83877f_detect(struct ppc_data *ppc, int chipset_mode) { int i, j, efer; unsigned char r, hefere, hefras; for (i = 0; i < 4; i ++) { /* first try to enable configuration registers */ efer = w83877f_efers[i]; /* write the key to the EFER */ for (j = 0; j < w83877f_keyiter[i]; j ++) outb (efer, w83877f_keys[i]); /* then check HEFERE and HEFRAS bits */ outb (efir, 0x0c); hefere = inb(efdr) & WINB_HEFERE; outb (efir, 0x16); hefras = inb(efdr) & WINB_HEFRAS; /* * HEFRAS HEFERE * 0 1 write 89h to 250h (power-on default) * 1 0 write 86h twice to 3f0h * 1 1 write 87h twice to 3f0h * 0 0 write 88h to 250h */ if ((hefere | hefras) == w83877f_hefs[i]) goto found; } return (-1); /* failed */ found: /* check base port address - read from CR23 */ outb(efir, 0x23); if (ppc->ppc_base != inb(efdr) * 4) /* 4 bytes boundaries */ return (-1); /* read CHIP ID from CR9/bits0-3 */ outb(efir, 0x9); switch (inb(efdr) & WINB_CHIPID) { case WINB_W83877F_ID: ppc->ppc_model = WINB_W83877F; break; case WINB_W83877AF_ID: ppc->ppc_model = WINB_W83877AF; break; default: ppc->ppc_model = WINB_UNKNOWN; } if (bootverbose) { /* dump of registers */ device_printf(ppc->ppc_dev, "0x%x - ", w83877f_keys[i]); for (i = 0; i <= 0xd; i ++) { outb(efir, i); printf("0x%x ", inb(efdr)); } for (i = 0x10; i <= 0x17; i ++) { outb(efir, i); printf("0x%x ", inb(efdr)); } outb(efir, 0x1e); printf("0x%x ", inb(efdr)); for (i = 0x20; i <= 0x29; i ++) { outb(efir, i); printf("0x%x ", inb(efdr)); } printf("\n"); } ppc->ppc_type = PPC_TYPE_GENERIC; if (!chipset_mode) { /* autodetect mode */ /* select CR0 */ outb(efir, 0x0); r = inb(efdr) & (WINB_PRTMODS0 | WINB_PRTMODS1); /* select CR9 */ outb(efir, 0x9); r |= (inb(efdr) & WINB_PRTMODS2); switch (r) { case WINB_W83757: if (bootverbose) device_printf(ppc->ppc_dev, "W83757 compatible mode\n"); return (-1); /* generic or SMC-like */ case WINB_EXTFDC: case WINB_EXTADP: case WINB_EXT2FDD: case WINB_JOYSTICK: if (bootverbose) device_printf(ppc->ppc_dev, "not in parallel port mode\n"); return (-1); case (WINB_PARALLEL | WINB_EPP_SPP): ppc->ppc_avm |= PPB_EPP | PPB_SPP; if (bootverbose) device_printf(ppc->ppc_dev, "EPP SPP\n"); break; case (WINB_PARALLEL | WINB_ECP): ppc->ppc_avm |= PPB_ECP | PPB_SPP; if (bootverbose) device_printf(ppc->ppc_dev, "ECP SPP\n"); break; case (WINB_PARALLEL | WINB_ECP_EPP): ppc->ppc_avm |= PPB_ECP | PPB_EPP | PPB_SPP; ppc->ppc_type = PPC_TYPE_SMCLIKE; if (bootverbose) device_printf(ppc->ppc_dev, "ECP+EPP SPP\n"); break; default: printf("%s: unknown case (0x%x)!\n", __func__, r); } } else { /* mode forced */ /* select CR9 and set PRTMODS2 bit */ outb(efir, 0x9); outb(efdr, inb(efdr) & ~WINB_PRTMODS2); /* select CR0 and reset PRTMODSx bits */ outb(efir, 0x0); outb(efdr, inb(efdr) & ~(WINB_PRTMODS0 | WINB_PRTMODS1)); if (chipset_mode & PPB_ECP) { if (chipset_mode & PPB_EPP) { outb(efdr, inb(efdr) | WINB_ECP_EPP); if (bootverbose) device_printf(ppc->ppc_dev, "ECP+EPP\n"); ppc->ppc_type = PPC_TYPE_SMCLIKE; } else { outb(efdr, inb(efdr) | WINB_ECP); if (bootverbose) device_printf(ppc->ppc_dev, "ECP\n"); } } else { /* select EPP_SPP otherwise */ outb(efdr, inb(efdr) | WINB_EPP_SPP); if (bootverbose) device_printf(ppc->ppc_dev, "EPP SPP\n"); } ppc->ppc_avm = chipset_mode; } /* exit configuration mode */ 
outb(efer, 0xaa); switch (ppc->ppc_type) { case PPC_TYPE_SMCLIKE: ppc_smclike_setmode(ppc, chipset_mode); break; default: ppc_generic_setmode(ppc, chipset_mode); break; } return (chipset_mode); } #endif /* * ppc_generic_detect */ static int ppc_generic_detect(struct ppc_data *ppc, int chipset_mode) { /* default to generic */ ppc->ppc_type = PPC_TYPE_GENERIC; if (bootverbose) device_printf(ppc->ppc_dev, "SPP"); /* first, check for ECP */ w_ecr(ppc, PPC_ECR_PS2); if ((r_ecr(ppc) & 0xe0) == PPC_ECR_PS2) { ppc->ppc_dtm |= PPB_ECP | PPB_SPP; if (bootverbose) printf(" ECP "); /* search for SMC style ECP+EPP mode */ w_ecr(ppc, PPC_ECR_EPP); } /* try to reset EPP timeout bit */ if (ppc_check_epp_timeout(ppc)) { ppc->ppc_dtm |= PPB_EPP; if (ppc->ppc_dtm & PPB_ECP) { /* SMC like chipset found */ ppc->ppc_model = SMC_LIKE; ppc->ppc_type = PPC_TYPE_SMCLIKE; if (bootverbose) printf(" ECP+EPP"); } else { if (bootverbose) printf(" EPP"); } } else { /* restore to standard mode */ w_ecr(ppc, PPC_ECR_STD); } /* XXX try to detect NIBBLE and PS2 modes */ ppc->ppc_dtm |= PPB_NIBBLE; if (chipset_mode) ppc->ppc_avm = chipset_mode; else ppc->ppc_avm = ppc->ppc_dtm; if (bootverbose) printf("\n"); switch (ppc->ppc_type) { case PPC_TYPE_SMCLIKE: ppc_smclike_setmode(ppc, chipset_mode); break; default: ppc_generic_setmode(ppc, chipset_mode); break; } return (chipset_mode); } /* * ppc_detect() * * mode is the mode suggested at boot */ static int ppc_detect(struct ppc_data *ppc, int chipset_mode) { #ifdef PPC_PROBE_CHIPSET int i, mode; /* list of supported chipsets */ int (*chipset_detect[])(struct ppc_data *, int) = { ppc_pc873xx_detect, ppc_smc37c66xgt_detect, ppc_w83877f_detect, ppc_smc37c935_detect, ppc_generic_detect, NULL }; #endif /* if can't find the port and mode not forced return error */ if (!ppc_detect_port(ppc) && chipset_mode == 0) return (EIO); /* failed, port not present */ /* assume centronics compatible mode is supported */ ppc->ppc_avm = PPB_COMPATIBLE; #ifdef PPC_PROBE_CHIPSET /* we have to differenciate available chipset modes, * chipset running modes and IEEE-1284 operating modes * * after detection, the port must support running in compatible mode */ if (ppc->ppc_flags & 0x40) { if (bootverbose) printf("ppc: chipset forced to generic\n"); #endif ppc->ppc_mode = ppc_generic_detect(ppc, chipset_mode); #ifdef PPC_PROBE_CHIPSET } else { for (i=0; chipset_detect[i] != NULL; i++) { if ((mode = chipset_detect[i](ppc, chipset_mode)) != -1) { ppc->ppc_mode = mode; break; } } } #endif /* configure/detect ECP FIFO */ if ((ppc->ppc_avm & PPB_ECP) && !(ppc->ppc_flags & 0x80)) ppc_detect_fifo(ppc); return (0); } /* * ppc_exec_microseq() * * Execute a microsequence. * Microsequence mechanism is supposed to handle fast I/O operations. 
*/ int ppc_exec_microseq(device_t dev, struct ppb_microseq **p_msq) { struct ppc_data *ppc = DEVTOSOFTC(dev); struct ppb_microseq *mi; char cc, *p; int i, iter, len; int error; int reg; char mask; int accum = 0; char *ptr = NULL; struct ppb_microseq *stack = NULL; /* microsequence registers are equivalent to PC-like port registers */ #define r_reg(reg,ppc) (bus_read_1((ppc)->res_ioport, reg)) #define w_reg(reg, ppc, byte) (bus_write_1((ppc)->res_ioport, reg, byte)) #define INCR_PC (mi ++) /* increment program counter */ PPC_ASSERT_LOCKED(ppc); mi = *p_msq; for (;;) { switch (mi->opcode) { case MS_OP_RSET: cc = r_reg(mi->arg[0].i, ppc); cc &= (char)mi->arg[2].i; /* clear mask */ cc |= (char)mi->arg[1].i; /* assert mask */ w_reg(mi->arg[0].i, ppc, cc); INCR_PC; break; case MS_OP_RASSERT_P: reg = mi->arg[1].i; ptr = ppc->ppc_ptr; if ((len = mi->arg[0].i) == MS_ACCUM) { accum = ppc->ppc_accum; for (; accum; accum--) w_reg(reg, ppc, *ptr++); ppc->ppc_accum = accum; } else for (i=0; i<len; i++) w_reg(reg, ppc, *ptr++); ppc->ppc_ptr = ptr; INCR_PC; break; case MS_OP_RFETCH_P: reg = mi->arg[1].i; mask = (char)mi->arg[2].i; ptr = ppc->ppc_ptr; if ((len = mi->arg[0].i) == MS_ACCUM) { accum = ppc->ppc_accum; for (; accum; accum--) *ptr++ = r_reg(reg, ppc) & mask; ppc->ppc_accum = accum; } else for (i=0; i<len; i++) *ptr++ = r_reg(reg, ppc) & mask; ppc->ppc_ptr = ptr; INCR_PC; break; case MS_OP_RFETCH: *((char *) mi->arg[2].p) = r_reg(mi->arg[0].i, ppc) & (char)mi->arg[1].i; INCR_PC; break; case MS_OP_RASSERT: case MS_OP_DELAY: /* let's suppose the next instr. is the same */ prefetch: for (;mi->opcode == MS_OP_RASSERT; INCR_PC) w_reg(mi->arg[0].i, ppc, (char)mi->arg[1].i); if (mi->opcode == MS_OP_DELAY) { DELAY(mi->arg[0].i); INCR_PC; goto prefetch; } break; case MS_OP_ADELAY: if (mi->arg[0].i) { PPC_UNLOCK(ppc); pause("ppbdelay", mi->arg[0].i * (hz/1000)); PPC_LOCK(ppc); } INCR_PC; break; case MS_OP_TRIG: reg = mi->arg[0].i; iter = mi->arg[1].i; p = (char *)mi->arg[2].p; /* XXX delay limited to 255 us */ for (i=0; i<iter; i++) { w_reg(reg, ppc, *p++); DELAY((unsigned char)*p++); } INCR_PC; break; case MS_OP_SET: ppc->ppc_accum = mi->arg[0].i; INCR_PC; break; case MS_OP_DBRA: if (--ppc->ppc_accum > 0) mi += mi->arg[0].i; INCR_PC; break; case MS_OP_BRSET: cc = r_str(ppc); if ((cc & (char)mi->arg[0].i) == (char)mi->arg[0].i) mi += mi->arg[1].i; INCR_PC; break; case MS_OP_BRCLEAR: cc = r_str(ppc); if ((cc & (char)mi->arg[0].i) == 0) mi += mi->arg[1].i; INCR_PC; break; case MS_OP_BRSTAT: cc = r_str(ppc); if ((cc & ((char)mi->arg[0].i | (char)mi->arg[1].i)) == (char)mi->arg[0].i) mi += mi->arg[2].i; INCR_PC; break; case MS_OP_C_CALL: /* * If the C call returns !0 then end the microseq.
* The current state of ptr is passed to the C function */ if ((error = mi->arg[0].f(mi->arg[1].p, ppc->ppc_ptr))) return (error); INCR_PC; break; case MS_OP_PTR: ppc->ppc_ptr = (char *)mi->arg[0].p; INCR_PC; break; case MS_OP_CALL: if (stack) panic("%s: too much calls", __func__); if (mi->arg[0].p) { /* store the state of the actual * microsequence */ stack = mi; /* jump to the new microsequence */ mi = (struct ppb_microseq *)mi->arg[0].p; } else INCR_PC; break; case MS_OP_SUBRET: /* retrieve microseq and pc state before the call */ mi = stack; /* reset the stack */ stack = NULL; /* XXX return code */ INCR_PC; break; case MS_OP_PUT: case MS_OP_GET: case MS_OP_RET: /* can't return to ppb level during the execution * of a submicrosequence */ if (stack) panic("%s: can't return to ppb level", __func__); /* update pc for ppb level of execution */ *p_msq = mi; /* return to ppb level of execution */ return (0); default: panic("%s: unknown microsequence opcode 0x%x", __func__, mi->opcode); } } /* unreached */ } static void ppcintr(void *arg) { struct ppc_data *ppc = arg; u_char ctr, ecr, str; /* * If we have any child interrupt handlers registered, let * them handle this interrupt. * * XXX: If DMA is in progress should we just complete that w/o * doing this? */ PPC_LOCK(ppc); if (ppc->ppc_intr_hook != NULL && ppc->ppc_intr_hook(ppc->ppc_intr_arg) == 0) { PPC_UNLOCK(ppc); return; } str = r_str(ppc); ctr = r_ctr(ppc); ecr = r_ecr(ppc); #if defined(PPC_DEBUG) && PPC_DEBUG > 1 printf("![%x/%x/%x]", ctr, ecr, str); #endif /* don't use ecp mode with IRQENABLE set */ if (ctr & IRQENABLE) { PPC_UNLOCK(ppc); return; } /* interrupts are generated by nFault signal * only in ECP mode */ if ((str & nFAULT) && (ppc->ppc_mode & PPB_ECP)) { /* check if ppc driver has programmed the * nFault interrupt */ if (ppc->ppc_irqstat & PPC_IRQ_nFAULT) { w_ecr(ppc, ecr | PPC_nFAULT_INTR); ppc->ppc_irqstat &= ~PPC_IRQ_nFAULT; } else { /* shall be handled by underlying layers XXX */ PPC_UNLOCK(ppc); return; } } if (ppc->ppc_irqstat & PPC_IRQ_DMA) { /* disable interrupts (should be done by hardware though) */ w_ecr(ppc, ecr | PPC_SERVICE_INTR); ppc->ppc_irqstat &= ~PPC_IRQ_DMA; ecr = r_ecr(ppc); /* check if DMA completed */ if ((ppc->ppc_avm & PPB_ECP) && (ecr & PPC_ENABLE_DMA)) { #ifdef PPC_DEBUG printf("a"); #endif /* stop DMA */ w_ecr(ppc, ecr & ~PPC_ENABLE_DMA); ecr = r_ecr(ppc); if (ppc->ppc_dmastat == PPC_DMA_STARTED) { #ifdef PPC_DEBUG printf("d"); #endif ppc->ppc_dmadone(ppc); ppc->ppc_dmastat = PPC_DMA_COMPLETE; /* wakeup the waiting process */ wakeup(ppc); } } } else if (ppc->ppc_irqstat & PPC_IRQ_FIFO) { /* classic interrupt I/O */ ppc->ppc_irqstat &= ~PPC_IRQ_FIFO; } PPC_UNLOCK(ppc); return; } int ppc_read(device_t dev, char *buf, int len, int mode) { return (EINVAL); } int ppc_write(device_t dev, char *buf, int len, int how) { return (EINVAL); } int ppc_reset_epp(device_t dev) { struct ppc_data *ppc = DEVTOSOFTC(dev); PPC_ASSERT_LOCKED(ppc); ppc_reset_epp_timeout(ppc); return 0; } int ppc_setmode(device_t dev, int mode) { struct ppc_data *ppc = DEVTOSOFTC(dev); PPC_ASSERT_LOCKED(ppc); switch (ppc->ppc_type) { case PPC_TYPE_SMCLIKE: return (ppc_smclike_setmode(ppc, mode)); break; case PPC_TYPE_GENERIC: default: return (ppc_generic_setmode(ppc, mode)); break; } /* not reached */ return (ENXIO); } int ppc_probe(device_t dev, int rid) { #ifdef __i386__ static short next_bios_ppc = 0; #endif struct ppc_data *ppc; int error; rman_res_t port; /* * Allocate the ppc_data structure. 
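A caller drives the interpreter above by building an array of ppb_microseq instructions and submitting it with the ppc lock held. A sketch using raw initializers rather than the MS_* convenience macros from ppb_msq.h; the register offset 2 (control port) and the STROBE bit value are illustrative only:

#define STROBE	0x01

static struct ppb_microseq strobe_msq[] = {
	{ MS_OP_RSET,  {{ 2 }, { STROBE }, { ~STROBE }} },	/* assert */
	{ MS_OP_DELAY, {{ 1 }} },				/* wait 1 us */
	{ MS_OP_RSET,  {{ 2 }, { 0 }, { ~STROBE }} },		/* deassert */
	{ MS_OP_RET,   {{ 0 }} }
};

	/* invocation fragment, error handling elided */
	struct ppb_microseq *msq = strobe_msq;
	(void)ppc_exec_microseq(dev, &msq);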
*/ ppc = DEVTOSOFTC(dev); bzero(ppc, sizeof(struct ppc_data)); ppc->rid_ioport = rid; /* retrieve ISA parameters */ error = bus_get_resource(dev, SYS_RES_IOPORT, rid, &port, NULL); #ifdef __i386__ /* * If port not specified, use bios list. */ if (error) { if ((next_bios_ppc < BIOS_MAX_PPC) && (*(BIOS_PORTS + next_bios_ppc) != 0)) { port = *(BIOS_PORTS + next_bios_ppc++); if (bootverbose) device_printf(dev, "parallel port found at 0x%jx\n", port); } else { device_printf(dev, "parallel port not found.\n"); return (ENXIO); } bus_set_resource(dev, SYS_RES_IOPORT, rid, port, IO_LPTSIZE_EXTENDED); } #endif /* IO port is mandatory */ /* Try "extended" IO port range...*/ ppc->res_ioport = bus_alloc_resource_anywhere(dev, SYS_RES_IOPORT, &ppc->rid_ioport, IO_LPTSIZE_EXTENDED, RF_ACTIVE); if (ppc->res_ioport != 0) { if (bootverbose) device_printf(dev, "using extended I/O port range\n"); } else { /* Failed? If so, then try the "normal" IO port range... */ ppc->res_ioport = bus_alloc_resource_anywhere(dev, SYS_RES_IOPORT, &ppc->rid_ioport, IO_LPTSIZE_NORMAL, RF_ACTIVE); if (ppc->res_ioport != 0) { if (bootverbose) device_printf(dev, "using normal I/O port range\n"); } else { if (bootverbose) device_printf(dev, "cannot reserve I/O port range\n"); goto error; } } ppc->ppc_base = rman_get_start(ppc->res_ioport); ppc->ppc_flags = device_get_flags(dev); if (!(ppc->ppc_flags & 0x20)) { ppc->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ppc->rid_irq, RF_SHAREABLE); ppc->res_drq = bus_alloc_resource_any(dev, SYS_RES_DRQ, &ppc->rid_drq, RF_ACTIVE); } if (ppc->res_irq) ppc->ppc_irq = rman_get_start(ppc->res_irq); if (ppc->res_drq) ppc->ppc_dmachan = rman_get_start(ppc->res_drq); ppc->ppc_dev = dev; ppc->ppc_model = GENERIC; ppc->ppc_mode = PPB_COMPATIBLE; ppc->ppc_epp = (ppc->ppc_flags & 0x10) >> 4; ppc->ppc_type = PPC_TYPE_GENERIC; /* * Try to detect the chipset and its mode. */ if (ppc_detect(ppc, ppc->ppc_flags & 0xf)) goto error; return (0); error: if (ppc->res_irq != 0) { bus_release_resource(dev, SYS_RES_IRQ, ppc->rid_irq, ppc->res_irq); } if (ppc->res_ioport != 0) { bus_release_resource(dev, SYS_RES_IOPORT, ppc->rid_ioport, ppc->res_ioport); } if (ppc->res_drq != 0) { bus_release_resource(dev, SYS_RES_DRQ, ppc->rid_drq, ppc->res_drq); } return (ENXIO); } int ppc_attach(device_t dev) { struct ppc_data *ppc = DEVTOSOFTC(dev); int error; mtx_init(&ppc->ppc_lock, device_get_nameunit(dev), "ppc", MTX_DEF); device_printf(dev, "%s chipset (%s) in %s mode%s\n", ppc_models[ppc->ppc_model], ppc_avms[ppc->ppc_avm], ppc_modes[ppc->ppc_mode], (PPB_IS_EPP(ppc->ppc_mode)) ? ppc_epp_protocol[ppc->ppc_epp] : ""); if (ppc->ppc_fifo) device_printf(dev, "FIFO with %d/%d/%d bytes threshold\n", ppc->ppc_fifo, ppc->ppc_wthr, ppc->ppc_rthr); if (ppc->res_irq) { /* default to the tty mask for registration */ /* XXX */ error = bus_setup_intr(dev, ppc->res_irq, INTR_TYPE_TTY | INTR_MPSAFE, NULL, ppcintr, ppc, &ppc->intr_cookie); if (error) { device_printf(dev, "failed to register interrupt handler: %d\n", error); mtx_destroy(&ppc->ppc_lock); return (error); } } /* add ppbus as a child of this isa to parallel bridge */ ppc->ppbus = device_add_child(dev, "ppbus", -1); /* * Probe the ppbus and attach devices found. 
*/ device_probe_and_attach(ppc->ppbus); return (0); } int ppc_detach(device_t dev) { struct ppc_data *ppc = DEVTOSOFTC(dev); if (ppc->res_irq == 0) { return (ENXIO); } /* detach & delete all children */ device_delete_children(dev); if (ppc->res_irq != 0) { bus_teardown_intr(dev, ppc->res_irq, ppc->intr_cookie); bus_release_resource(dev, SYS_RES_IRQ, ppc->rid_irq, ppc->res_irq); } if (ppc->res_ioport != 0) { bus_release_resource(dev, SYS_RES_IOPORT, ppc->rid_ioport, ppc->res_ioport); } if (ppc->res_drq != 0) { bus_release_resource(dev, SYS_RES_DRQ, ppc->rid_drq, ppc->res_drq); } mtx_destroy(&ppc->ppc_lock); return (0); } u_char ppc_io(device_t ppcdev, int iop, u_char *addr, int cnt, u_char byte) { struct ppc_data *ppc = DEVTOSOFTC(ppcdev); PPC_ASSERT_LOCKED(ppc); switch (iop) { case PPB_OUTSB_EPP: bus_write_multi_1(ppc->res_ioport, PPC_EPP_DATA, addr, cnt); break; case PPB_OUTSW_EPP: bus_write_multi_2(ppc->res_ioport, PPC_EPP_DATA, (u_int16_t *)addr, cnt); break; case PPB_OUTSL_EPP: bus_write_multi_4(ppc->res_ioport, PPC_EPP_DATA, (u_int32_t *)addr, cnt); break; case PPB_INSB_EPP: bus_read_multi_1(ppc->res_ioport, PPC_EPP_DATA, addr, cnt); break; case PPB_INSW_EPP: bus_read_multi_2(ppc->res_ioport, PPC_EPP_DATA, (u_int16_t *)addr, cnt); break; case PPB_INSL_EPP: bus_read_multi_4(ppc->res_ioport, PPC_EPP_DATA, (u_int32_t *)addr, cnt); break; case PPB_RDTR: return (r_dtr(ppc)); case PPB_RSTR: return (r_str(ppc)); case PPB_RCTR: return (r_ctr(ppc)); case PPB_REPP_A: return (r_epp_A(ppc)); case PPB_REPP_D: return (r_epp_D(ppc)); case PPB_RECR: return (r_ecr(ppc)); case PPB_RFIFO: return (r_fifo(ppc)); case PPB_WDTR: w_dtr(ppc, byte); break; case PPB_WSTR: w_str(ppc, byte); break; case PPB_WCTR: w_ctr(ppc, byte); break; case PPB_WEPP_A: w_epp_A(ppc, byte); break; case PPB_WEPP_D: w_epp_D(ppc, byte); break; case PPB_WECR: w_ecr(ppc, byte); break; case PPB_WFIFO: w_fifo(ppc, byte); break; default: panic("%s: unknown I/O operation", __func__); break; } return (0); /* not significative */ } int ppc_read_ivar(device_t bus, device_t dev, int index, uintptr_t *val) { struct ppc_data *ppc = (struct ppc_data *)device_get_softc(bus); switch (index) { case PPC_IVAR_EPP_PROTO: PPC_ASSERT_LOCKED(ppc); *val = (u_long)ppc->ppc_epp; break; case PPC_IVAR_LOCK: *val = (uintptr_t)&ppc->ppc_lock; break; default: return (ENOENT); } return (0); } int ppc_write_ivar(device_t bus, device_t dev, int index, uintptr_t val) { struct ppc_data *ppc = (struct ppc_data *)device_get_softc(bus); switch (index) { case PPC_IVAR_INTR_HANDLER: PPC_ASSERT_LOCKED(ppc); if (dev != ppc->ppbus) return (EINVAL); if (val == 0) { ppc->ppc_intr_hook = NULL; break; } if (ppc->ppc_intr_hook != NULL) return (EBUSY); ppc->ppc_intr_hook = (void *)val; ppc->ppc_intr_arg = device_get_softc(dev); break; default: return (ENOENT); } return (0); } /* * We allow child devices to allocate an IRQ resource at rid 0 for their * interrupt handlers. 
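The comment just above is implemented by ppc_alloc_resource() below: a child asking for SYS_RES_IRQ at rid 0 is simply handed the parent's res_irq. From the child's side this is the ordinary newbus idiom; a sketch with error handling trimmed:

	int rid = 0;
	struct resource *irq;

	/* ppc returns its own shared interrupt for rid 0 */
	irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE);
	if (irq == NULL)
		return (ENXIO);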
*/ struct resource * ppc_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct ppc_data *ppc = DEVTOSOFTC(bus); switch (type) { case SYS_RES_IRQ: if (*rid == 0) return (ppc->res_irq); break; } return (NULL); } int ppc_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { #ifdef INVARIANTS struct ppc_data *ppc = DEVTOSOFTC(bus); #endif switch (type) { case SYS_RES_IRQ: if (rid == 0) { KASSERT(r == ppc->res_irq, ("ppc child IRQ resource mismatch")); return (0); } break; } return (EINVAL); } MODULE_DEPEND(ppc, ppbus, 1, 1, 1); Index: head/sys/dev/syscons/syscons.c =================================================================== --- head/sys/dev/syscons/syscons.c (revision 332488) +++ head/sys/dev/syscons/syscons.c (revision 332489) @@ -1,4249 +1,4253 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992-1998 Søren Schmidt * All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Sascha Wildner * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_syscons.h" #include "opt_splash.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__arm__) || defined(__mips__) || \ defined(__powerpc__) || defined(__sparc64__) #include #else #include #endif #if defined( __i386__) || defined(__amd64__) #include #include #endif #include #if defined(__amd64__) || defined(__i386__) #include #include #include #endif #include #include #include #include #define COLD 0 #define WARM 1 #define DEFAULT_BLANKTIME (5*60) /* 5 minutes */ #define MAX_BLANKTIME (7*24*60*60) /* 7 days!? 
*/ #define KEYCODE_BS 0x0e /* "<-- Backspace" key, XXX */ /* NULL-safe version of "tty_opened()" */ #define tty_opened_ns(tp) ((tp) != NULL && tty_opened(tp)) static u_char sc_kattrtab[MAXCPU]; static int sc_console_unit = -1; static int sc_saver_keyb_only = 1; static scr_stat *sc_console; static struct consdev *sc_consptr; static void *sc_kts[MAXCPU]; static struct sc_term_sw *sc_ktsw; static scr_stat main_console; static struct tty *main_devs[MAXCONS]; static char init_done = COLD; static int shutdown_in_progress = FALSE; static int suspend_in_progress = FALSE; static char sc_malloc = FALSE; static int saver_mode = CONS_NO_SAVER; /* LKM/user saver */ static int run_scrn_saver = FALSE; /* should run the saver? */ static int enable_bell = TRUE; /* enable beeper */ #ifndef SC_DISABLE_REBOOT static int enable_reboot = TRUE; /* enable keyboard reboot */ #endif #ifndef SC_DISABLE_KDBKEY static int enable_kdbkey = TRUE; /* enable keyboard debug */ #endif static long scrn_blank_time = 0; /* screen saver timeout value */ #ifdef DEV_SPLASH static int scrn_blanked; /* # of blanked screen */ static int sticky_splash = FALSE; static void none_saver(sc_softc_t *sc, int blank) { } static void (*current_saver)(sc_softc_t *, int) = none_saver; #endif #ifdef SC_NO_SUSPEND_VTYSWITCH static int sc_no_suspend_vtswitch = 1; #else static int sc_no_suspend_vtswitch = 0; #endif static int sc_susp_scr; static SYSCTL_NODE(_hw, OID_AUTO, syscons, CTLFLAG_RD, 0, "syscons"); static SYSCTL_NODE(_hw_syscons, OID_AUTO, saver, CTLFLAG_RD, 0, "saver"); SYSCTL_INT(_hw_syscons_saver, OID_AUTO, keybonly, CTLFLAG_RW, &sc_saver_keyb_only, 0, "screen saver interrupted by input only"); SYSCTL_INT(_hw_syscons, OID_AUTO, bell, CTLFLAG_RW, &enable_bell, 0, "enable bell"); #ifndef SC_DISABLE_REBOOT SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_reboot, CTLFLAG_RW|CTLFLAG_SECURE, &enable_reboot, 0, "enable keyboard reboot"); #endif #ifndef SC_DISABLE_KDBKEY SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_debug, CTLFLAG_RW|CTLFLAG_SECURE, &enable_kdbkey, 0, "enable keyboard debug"); #endif SYSCTL_INT(_hw_syscons, OID_AUTO, sc_no_suspend_vtswitch, CTLFLAG_RWTUN, &sc_no_suspend_vtswitch, 0, "Disable VT switch before suspend."); #if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT) #include "font.h" #endif tsw_ioctl_t *sc_user_ioctl; static bios_values_t bios_value; static int enable_panic_key; SYSCTL_INT(_machdep, OID_AUTO, enable_panic_key, CTLFLAG_RW, &enable_panic_key, 0, "Enable panic via keypress specified in kbdmap(5)"); #define SC_CONSOLECTL 255 #define VTY_WCHAN(sc, vty) (&SC_DEV(sc, vty)) /* prototypes */ static int sc_allocate_keyboard(sc_softc_t *sc, int unit); static int scvidprobe(int unit, int flags, int cons); static int sckbdprobe(int unit, int flags, int cons); static void scmeminit(void *arg); static int scdevtounit(struct tty *tp); static kbd_callback_func_t sckbdevent; static void scinit(int unit, int flags); static scr_stat *sc_get_stat(struct tty *tp); static void scterm(int unit, int flags); static void scshutdown(void *, int); static void scsuspend(void *); static void scresume(void *); static u_int scgetc(sc_softc_t *sc, u_int flags, struct sc_cnstate *sp); static void sc_puts(scr_stat *scp, u_char *buf, int len); #define SCGETC_CN 1 #define SCGETC_NONBLOCK 2 static void sccnupdate(scr_stat *scp); static scr_stat *alloc_scp(sc_softc_t *sc, int vty); static void init_scp(sc_softc_t *sc, int vty, scr_stat *scp); static timeout_t scrn_timer; static int and_region(int *s1, int *e1, int s2, int e2); static void scrn_update(scr_stat 
*scp, int show_cursor); #ifdef DEV_SPLASH static int scsplash_callback(int event, void *arg); static void scsplash_saver(sc_softc_t *sc, int show); static int add_scrn_saver(void (*this_saver)(sc_softc_t *, int)); static int remove_scrn_saver(void (*this_saver)(sc_softc_t *, int)); static int set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border); static int restore_scrn_saver_mode(scr_stat *scp, int changemode); static void stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int)); static int wait_scrn_saver_stop(sc_softc_t *sc); #define scsplash_stick(stick) (sticky_splash = (stick)) #else /* !DEV_SPLASH */ #define scsplash_stick(stick) #endif /* DEV_SPLASH */ static int do_switch_scr(sc_softc_t *sc, int s); static int vt_proc_alive(scr_stat *scp); static int signal_vt_rel(scr_stat *scp); static int signal_vt_acq(scr_stat *scp); static int finish_vt_rel(scr_stat *scp, int release, int *s); static int finish_vt_acq(scr_stat *scp); static void exchange_scr(sc_softc_t *sc); static void update_cursor_image(scr_stat *scp); static void change_cursor_shape(scr_stat *scp, int flags, int base, int height); static void update_font(scr_stat *); static int save_kbd_state(scr_stat *scp); static int update_kbd_state(scr_stat *scp, int state, int mask); static int update_kbd_leds(scr_stat *scp, int which); static int sc_kattr(void); static timeout_t blink_screen; static struct tty *sc_alloc_tty(int, int); static cn_probe_t sc_cnprobe; static cn_init_t sc_cninit; static cn_term_t sc_cnterm; static cn_getc_t sc_cngetc; static cn_putc_t sc_cnputc; static cn_grab_t sc_cngrab; static cn_ungrab_t sc_cnungrab; CONSOLE_DRIVER(sc); static tsw_open_t sctty_open; static tsw_close_t sctty_close; static tsw_outwakeup_t sctty_outwakeup; static tsw_ioctl_t sctty_ioctl; static tsw_mmap_t sctty_mmap; static struct ttydevsw sc_ttydevsw = { .tsw_open = sctty_open, .tsw_close = sctty_close, .tsw_outwakeup = sctty_outwakeup, .tsw_ioctl = sctty_ioctl, .tsw_mmap = sctty_mmap, }; static d_ioctl_t consolectl_ioctl; static d_close_t consolectl_close; static struct cdevsw consolectl_devsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT | D_TRACKCLOSE, .d_ioctl = consolectl_ioctl, .d_close = consolectl_close, .d_name = "consolectl", }; /* ec -- emergency console. */ static u_int ec_scroffset; static void ec_putc(int c) { uintptr_t fb; u_short *scrptr; u_int ind; int attr, column, mysize, width, xsize, yborder, ysize; if (c < 0 || c > 0xff || c == '\a') return; if (sc_console == NULL) { #if !defined(__amd64__) && !defined(__i386__) return; #else /* * This is enough for ec_putc() to work very early on x86 * if the kernel starts in normal color text mode. */ +#ifdef __amd64__ fb = KERNBASE + 0xb8000; +#else /* __i386__ */ + fb = PMAP_MAP_LOW + 0xb8000; +#endif xsize = 80; ysize = 25; #endif } else { if (!ISTEXTSC(&main_console)) return; fb = main_console.sc->adp->va_window; xsize = main_console.xsize; ysize = main_console.ysize; } yborder = ysize / 5; scrptr = (u_short *)(void *)fb + xsize * yborder; mysize = xsize * (ysize - 2 * yborder); do { ind = ec_scroffset; column = ind % xsize; width = (c == '\b' ? -1 : c == '\t' ? (column + 8) & ~7 : c == '\r' ? -column : c == '\n' ? 
xsize - column : 1); if (width == 0 || (width < 0 && ind < -width)) return; } while (atomic_cmpset_rel_int(&ec_scroffset, ind, ind + width) == 0); if (c == '\b' || c == '\r') return; if (c == '\n') ind += xsize; /* XXX clearing from new pos is not atomic */ attr = sc_kattr(); if (c == '\t' || c == '\n') c = ' '; do scrptr[ind++ % mysize] = (attr << 8) | c; while (--width != 0); } int sc_probe_unit(int unit, int flags) { if (!vty_enabled(VTY_SC)) return ENXIO; if (!scvidprobe(unit, flags, FALSE)) { if (bootverbose) printf("%s%d: no video adapter found.\n", SC_DRIVER_NAME, unit); return ENXIO; } /* syscons will be attached even when there is no keyboard */ sckbdprobe(unit, flags, FALSE); return 0; } /* probe video adapters, return TRUE if found */ static int scvidprobe(int unit, int flags, int cons) { /* * Access the video adapter driver through the back door! * Video adapter drivers need to be configured before syscons. * However, when syscons is being probed as the low-level console, * they have not been initialized yet. We force them to initialize * themselves here. XXX */ vid_configure(cons ? VIO_PROBE_ONLY : 0); return (vid_find_adapter("*", unit) >= 0); } /* probe the keyboard, return TRUE if found */ static int sckbdprobe(int unit, int flags, int cons) { /* access the keyboard driver through the backdoor! */ kbd_configure(cons ? KB_CONF_PROBE_ONLY : 0); return (kbd_find_keyboard("*", unit) >= 0); } static char *adapter_name(video_adapter_t *adp) { static struct { int type; char *name[2]; } names[] = { { KD_MONO, { "MDA", "MDA" } }, { KD_HERCULES, { "Hercules", "Hercules" } }, { KD_CGA, { "CGA", "CGA" } }, { KD_EGA, { "EGA", "EGA (mono)" } }, { KD_VGA, { "VGA", "VGA (mono)" } }, { KD_TGA, { "TGA", "TGA" } }, { -1, { "Unknown", "Unknown" } }, }; int i; for (i = 0; names[i].type != -1; ++i) if (names[i].type == adp->va_type) break; return names[i].name[(adp->va_flags & V_ADP_COLOR) ? 0 : 1]; } static void sctty_outwakeup(struct tty *tp) { size_t len; u_char buf[PCBURST]; scr_stat *scp = sc_get_stat(tp); if (scp->status & SLKED || (scp == scp->sc->cur_scp && scp->sc->blink_in_progress)) return; for (;;) { len = ttydisc_getc(tp, buf, sizeof buf); if (len == 0) break; SC_VIDEO_LOCK(scp->sc); sc_puts(scp, buf, len); SC_VIDEO_UNLOCK(scp->sc); } } static struct tty * sc_alloc_tty(int index, int devnum) { struct sc_ttysoftc *stc; struct tty *tp; /* Allocate TTY object and softc to store unit number. */ stc = malloc(sizeof(struct sc_ttysoftc), M_DEVBUF, M_WAITOK); stc->st_index = index; stc->st_stat = NULL; tp = tty_alloc_mutex(&sc_ttydevsw, stc, &Giant); /* Create device node. */ tty_makedev(tp, NULL, "v%r", devnum); return (tp); } #ifdef SC_PIXEL_MODE static void sc_set_vesa_mode(scr_stat *scp, sc_softc_t *sc, int unit) { video_info_t info; u_char *font; int depth; int fontsize; int i; int vmode; vmode = 0; (void)resource_int_value("sc", unit, "vesa_mode", &vmode); if (vmode < M_VESA_BASE || vmode > M_VESA_MODE_MAX || vidd_get_info(sc->adp, vmode, &info) != 0 || !sc_support_pixel_mode(&info)) vmode = 0; /* * If the mode is unset or unsupported, search for an available * 800x600 graphics mode with the highest color depth. 
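sc_set_vesa_mode() above reads a per-unit hint through resource_int_value("sc", unit, "vesa_mode") before falling back to the 800x600 search described in the comment and implemented by the loop that follows. A hypothetical /boot/device.hints entry (the mode number is a placeholder; it must lie within [M_VESA_BASE, M_VESA_MODE_MAX] and pass sc_support_pixel_mode()):

hint.sc.0.vesa_mode="0x118"	# placeholder VESA mode number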
*/ if (vmode == 0) { for (depth = 0, i = M_VESA_BASE; i <= M_VESA_MODE_MAX; i++) if (vidd_get_info(sc->adp, i, &info) == 0 && info.vi_width == 800 && info.vi_height == 600 && sc_support_pixel_mode(&info) && info.vi_depth > depth) { vmode = i; depth = info.vi_depth; } if (vmode == 0) return; vidd_get_info(sc->adp, vmode, &info); } #if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT) fontsize = info.vi_cheight; #else fontsize = scp->font_size; #endif if (fontsize < 14) fontsize = 8; else if (fontsize >= 16) fontsize = 16; else fontsize = 14; #ifndef SC_NO_FONT_LOADING switch (fontsize) { case 8: if ((sc->fonts_loaded & FONT_8) == 0) return; font = sc->font_8; break; case 14: if ((sc->fonts_loaded & FONT_14) == 0) return; font = sc->font_14; break; case 16: if ((sc->fonts_loaded & FONT_16) == 0) return; font = sc->font_16; break; } #else font = NULL; #endif #ifdef DEV_SPLASH if ((sc->flags & SC_SPLASH_SCRN) != 0) splash_term(sc->adp); #endif #ifndef SC_NO_HISTORY if (scp->history != NULL) { sc_vtb_append(&scp->vtb, 0, scp->history, scp->ypos * scp->xsize + scp->xpos); scp->history_pos = sc_vtb_tail(scp->history); } #endif vidd_set_mode(sc->adp, vmode); scp->status |= (UNKNOWN_MODE | PIXEL_MODE | MOUSE_HIDDEN); scp->status &= ~(GRAPHICS_MODE | MOUSE_VISIBLE); scp->xpixel = info.vi_width; scp->ypixel = info.vi_height; scp->xsize = scp->xpixel / 8; scp->ysize = scp->ypixel / fontsize; scp->xpos = 0; scp->ypos = scp->ysize - 1; scp->xoff = scp->yoff = 0; scp->font = font; scp->font_size = fontsize; scp->font_width = 8; scp->start = scp->xsize * scp->ysize - 1; scp->end = 0; scp->cursor_pos = scp->cursor_oldpos = scp->xsize * scp->xsize; scp->mode = sc->initial_mode = vmode; #ifndef __sparc64__ sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize, (void *)sc->adp->va_window, FALSE); #endif sc_alloc_scr_buffer(scp, FALSE, FALSE); sc_init_emulator(scp, NULL); #ifndef SC_NO_CUTPASTE sc_alloc_cut_buffer(scp, FALSE); #endif #ifndef SC_NO_HISTORY sc_alloc_history_buffer(scp, 0, 0, FALSE); #endif sc_set_border(scp, scp->border); sc_set_cursor_image(scp); scp->status &= ~UNKNOWN_MODE; #ifdef DEV_SPLASH if ((sc->flags & SC_SPLASH_SCRN) != 0) splash_init(sc->adp, scsplash_callback, sc); #endif } #endif int sc_attach_unit(int unit, int flags) { sc_softc_t *sc; scr_stat *scp; struct cdev *dev; void *oldts, *ts; int i, vc; if (!vty_enabled(VTY_SC)) return ENXIO; flags &= ~SC_KERNEL_CONSOLE; if (sc_console_unit == unit) { /* * If this unit is being used as the system console, we need to * adjust some variables and buffers before and after scinit(). 
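As a concrete check of the geometry math in sc_set_vesa_mode() below, assume an 800x600 pixel mode with the 8x16 font: the text grid becomes xsize = 800 / 8 = 100 columns and ysize = 600 / 16 = 37 rows (integer division drops the 8 leftover scan lines), and the cursor parks on the last row, ypos = 36. A hypothetical helper making the same computation explicit:

/* Illustrative only: derive the text-cell grid from a pixel mode the
 * way sc_set_vesa_mode() does (font width fixed at 8 pixels). */
static void
pixel_to_cells(int xpixel, int ypixel, int fontsize, int *xsize, int *ysize)
{
	*xsize = xpixel / 8;		/* 800 / 8  == 100 columns */
	*ysize = ypixel / fontsize;	/* 600 / 16 == 37 rows     */
}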
*/ /* assert(sc_console != NULL) */ flags |= SC_KERNEL_CONSOLE; scmeminit(NULL); scinit(unit, flags); if (sc_console->tsw->te_size > 0) { sc_ktsw = sc_console->tsw; /* assert(sc_console->ts != NULL); */ oldts = sc_console->ts; for (i = 0; i <= mp_maxid; i++) { ts = malloc(sc_console->tsw->te_size, M_DEVBUF, M_WAITOK | M_ZERO); (*sc_console->tsw->te_init)(sc_console, &ts, SC_TE_COLD_INIT); sc_console->ts = ts; (*sc_console->tsw->te_default_attr)(sc_console, sc_kattrtab[i], SC_KERNEL_CONS_REV_ATTR); sc_kts[i] = ts; } sc_console->ts = oldts; (*sc_console->tsw->te_default_attr)(sc_console, SC_NORM_ATTR, SC_NORM_REV_ATTR); } } else { scinit(unit, flags); } sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE); sc->config = flags; callout_init(&sc->ctimeout, 0); callout_init(&sc->cblink, 0); scp = sc_get_stat(sc->dev[0]); if (sc_console == NULL) /* sc_console_unit < 0 */ sc_console = scp; #ifdef SC_PIXEL_MODE if ((sc->config & SC_VESAMODE) != 0) sc_set_vesa_mode(scp, sc, unit); #endif /* SC_PIXEL_MODE */ /* initialize cursor */ if (!ISGRAPHSC(scp)) update_cursor_image(scp); /* get screen update going */ scrn_timer(sc); /* set up the keyboard */ (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode); update_kbd_state(scp, scp->status, LOCK_MASK); printf("%s%d: %s <%d virtual consoles, flags=0x%x>\n", SC_DRIVER_NAME, unit, adapter_name(sc->adp), sc->vtys, sc->config); if (bootverbose) { printf("%s%d:", SC_DRIVER_NAME, unit); if (sc->adapter >= 0) printf(" fb%d", sc->adapter); if (sc->keyboard >= 0) printf(", kbd%d", sc->keyboard); if (scp->tsw) printf(", terminal emulator: %s (%s)", scp->tsw->te_name, scp->tsw->te_desc); printf("\n"); } /* Register suspend/resume/shutdown callbacks for the kernel console. */ if (sc_console_unit == unit) { EVENTHANDLER_REGISTER(power_suspend_early, scsuspend, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(power_resume, scresume, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(shutdown_pre_sync, scshutdown, NULL, SHUTDOWN_PRI_DEFAULT); } for (vc = 0; vc < sc->vtys; vc++) { if (sc->dev[vc] == NULL) { sc->dev[vc] = sc_alloc_tty(vc, vc + unit * MAXCONS); if (vc == 0 && sc->dev == main_devs) SC_STAT(sc->dev[0]) = &main_console; } /* * The first vty already has struct tty and scr_stat initialized * in scinit(). The other vtys will have these structs when * first opened. */ } dev = make_dev(&consolectl_devsw, 0, UID_ROOT, GID_WHEEL, 0600, "consolectl"); dev->si_drv1 = sc->dev[0]; return 0; } static void scmeminit(void *arg) { if (!vty_enabled(VTY_SC)) return; if (sc_malloc) return; sc_malloc = TRUE; /* * As soon as malloc() becomes functional, we had better allocate * various buffers for the kernel console. */ if (sc_console_unit < 0) /* sc_console == NULL */ return; /* copy the temporary buffer to the final buffer */ sc_alloc_scr_buffer(sc_console, FALSE, FALSE); #ifndef SC_NO_CUTPASTE sc_alloc_cut_buffer(sc_console, FALSE); #endif #ifndef SC_NO_HISTORY /* initialize history buffer & pointers */ sc_alloc_history_buffer(sc_console, 0, 0, FALSE); #endif } /* XXX */ SYSINIT(sc_mem, SI_SUB_KMEM, SI_ORDER_ANY, scmeminit, NULL); static int scdevtounit(struct tty *tp) { int vty = SC_VTY(tp); if (vty == SC_CONSOLECTL) return ((sc_console != NULL) ? 
sc_console->sc->unit : -1); else if ((vty < 0) || (vty >= MAXCONS*sc_max_unit())) return -1; else return vty/MAXCONS; } static int sctty_open(struct tty *tp) { int unit = scdevtounit(tp); sc_softc_t *sc; scr_stat *scp; #ifndef __sparc64__ keyarg_t key; #endif DPRINTF(5, ("scopen: dev:%s, unit:%d, vty:%d\n", devtoname(tp->t_dev), unit, SC_VTY(tp))); sc = sc_get_softc(unit, (sc_console_unit == unit) ? SC_KERNEL_CONSOLE : 0); if (sc == NULL) return ENXIO; if (!tty_opened(tp)) { /* Use the current setting of the <-- key as default VERASE. */ /* If the Delete key is preferable, an stty is necessary */ #ifndef __sparc64__ if (sc->kbd != NULL) { key.keynum = KEYCODE_BS; (void)kbdd_ioctl(sc->kbd, GIO_KEYMAPENT, (caddr_t)&key); tp->t_termios.c_cc[VERASE] = key.key.map[0]; } #endif } scp = sc_get_stat(tp); if (scp == NULL) { scp = SC_STAT(tp) = alloc_scp(sc, SC_VTY(tp)); if (ISGRAPHSC(scp)) sc_set_pixel_mode(scp, NULL, 0, 0, 16, 8); } if (!tp->t_winsize.ws_col && !tp->t_winsize.ws_row) { tp->t_winsize.ws_col = scp->xsize; tp->t_winsize.ws_row = scp->ysize; } return (0); } static void sctty_close(struct tty *tp) { scr_stat *scp; int s; if (SC_VTY(tp) != SC_CONSOLECTL) { scp = sc_get_stat(tp); /* were we in the middle of the VT switching process? */ DPRINTF(5, ("sc%d: scclose(), ", scp->sc->unit)); s = spltty(); if ((scp == scp->sc->cur_scp) && (scp->sc->unit == sc_console_unit)) cnavailable(sc_consptr, TRUE); if (finish_vt_rel(scp, TRUE, &s) == 0) /* force release */ DPRINTF(5, ("reset WAIT_REL, ")); if (finish_vt_acq(scp) == 0) /* force acknowledge */ DPRINTF(5, ("reset WAIT_ACQ, ")); #ifdef not_yet_done if (scp == &main_console) { scp->pid = 0; scp->proc = NULL; scp->smode.mode = VT_AUTO; } else { sc_vtb_destroy(&scp->vtb); #ifndef __sparc64__ sc_vtb_destroy(&scp->scr); #endif sc_free_history_buffer(scp, scp->ysize); SC_STAT(tp) = NULL; free(scp, M_DEVBUF); } #else scp->pid = 0; scp->proc = NULL; scp->smode.mode = VT_AUTO; #endif scp->kbd_mode = K_XLATE; if (scp == scp->sc->cur_scp) (void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode); DPRINTF(5, ("done.\n")); } } #if 0 /* XXX mpsafetty: fix screensaver. What about outwakeup? */ static int scread(struct cdev *dev, struct uio *uio, int flag) { if (!sc_saver_keyb_only) sc_touch_scrn_saver(); return ttyread(dev, uio, flag); } #endif static int sckbdevent(keyboard_t *thiskbd, int event, void *arg) { sc_softc_t *sc; struct tty *cur_tty; int c, error = 0; size_t len; const u_char *cp; sc = (sc_softc_t *)arg; /* assert(thiskbd == sc->kbd) */ mtx_lock(&Giant); switch (event) { case KBDIO_KEYINPUT: break; case KBDIO_UNLOADING: sc->kbd = NULL; sc->keyboard = -1; kbd_release(thiskbd, (void *)&sc->keyboard); goto done; default: error = EINVAL; goto done; } /* * Loop while there is still input to get from the keyboard. * I don't think this is necessary, and it doesn't fix * the Xaccel-2.1 keyboard hang, but it can't hurt.
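scdevtounit() above maps a vty number to its adapter unit by plain division, so with the stock MAXCONS of 16 (an assumption taken from the default build), ttyv0 through ttyv15 belong to sc0 and ttyv16 would belong to sc1. A trivial standalone restatement:

/* Sketch: the vty -> unit mapping used by scdevtounit(), assuming the
 * default MAXCONS of 16. */
#define MAXCONS_EXAMPLE 16

static int
vty_to_unit(int vty)
{
	return (vty / MAXCONS_EXAMPLE);	/* 0..15 -> unit 0, 16..31 -> unit 1 */
}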
XXX */ while ((c = scgetc(sc, SCGETC_NONBLOCK, NULL)) != NOKEY) { cur_tty = SC_DEV(sc, sc->cur_scp->index); if (!tty_opened_ns(cur_tty)) continue; if ((*sc->cur_scp->tsw->te_input)(sc->cur_scp, c, cur_tty)) continue; switch (KEYFLAGS(c)) { case 0x0000: /* normal key */ ttydisc_rint(cur_tty, KEYCHAR(c), 0); break; case FKEY: /* function key, return string */ cp = (*sc->cur_scp->tsw->te_fkeystr)(sc->cur_scp, c); if (cp != NULL) { ttydisc_rint_simple(cur_tty, cp, strlen(cp)); break; } cp = kbdd_get_fkeystr(thiskbd, KEYCHAR(c), &len); if (cp != NULL) ttydisc_rint_simple(cur_tty, cp, len); break; case MKEY: /* meta is active, prepend ESC */ ttydisc_rint(cur_tty, 0x1b, 0); ttydisc_rint(cur_tty, KEYCHAR(c), 0); break; case BKEY: /* backtab fixed sequence (esc [ Z) */ ttydisc_rint_simple(cur_tty, "\x1B[Z", 3); break; } ttydisc_rint_done(cur_tty); } sc->cur_scp->status |= MOUSE_HIDDEN; done: mtx_unlock(&Giant); return (error); } static int sctty_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) { int error; int i; struct cursor_attr *cap; sc_softc_t *sc; scr_stat *scp; int s; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) int ival; #endif /* If there is a user_ioctl function call that first */ if (sc_user_ioctl) { error = (*sc_user_ioctl)(tp, cmd, data, td); if (error != ENOIOCTL) return error; } error = sc_vid_ioctl(tp, cmd, data, td); if (error != ENOIOCTL) return error; #ifndef SC_NO_HISTORY error = sc_hist_ioctl(tp, cmd, data, td); if (error != ENOIOCTL) return error; #endif #ifndef SC_NO_SYSMOUSE error = sc_mouse_ioctl(tp, cmd, data, td); if (error != ENOIOCTL) return error; #endif scp = sc_get_stat(tp); /* assert(scp != NULL) */ /* scp is sc_console, if SC_VTY(dev) == SC_CONSOLECTL. */ sc = scp->sc; if (scp->tsw) { error = (*scp->tsw->te_ioctl)(scp, tp, cmd, data, td); if (error != ENOIOCTL) return error; } switch (cmd) { /* process console hardware related ioctl's */ case GIO_ATTR: /* get current attributes */ /* this ioctl is not processed here, but in the terminal emulator */ return ENOTTY; case GIO_COLOR: /* is this a color console ? */ *(int *)data = (sc->adp->va_flags & V_ADP_COLOR) ? 
1 : 0; return 0; case CONS_BLANKTIME: /* set screen saver timeout (0 = no saver) */ if (*(int *)data < 0 || *(int *)data > MAX_BLANKTIME) return EINVAL; s = spltty(); scrn_blank_time = *(int *)data; run_scrn_saver = (scrn_blank_time != 0); splx(s); return 0; case CONS_CURSORTYPE: /* set cursor type (old interface + HIDDEN) */ s = spltty(); *(int *)data &= CONS_CURSOR_ATTRS; sc_change_cursor_shape(scp, *(int *)data, -1, -1); splx(s); return 0; case CONS_GETCURSORSHAPE: /* get cursor shape (new interface) */ switch (((int *)data)[0] & (CONS_DEFAULT_CURSOR | CONS_LOCAL_CURSOR)) { case 0: cap = &sc->curs_attr; break; case CONS_LOCAL_CURSOR: cap = &scp->base_curs_attr; break; case CONS_DEFAULT_CURSOR: cap = &sc->dflt_curs_attr; break; case CONS_DEFAULT_CURSOR | CONS_LOCAL_CURSOR: cap = &scp->dflt_curs_attr; break; } if (((int *)data)[0] & CONS_CHARCURSOR_COLORS) { ((int *)data)[1] = cap->bg[0]; ((int *)data)[2] = cap->bg[1]; } else if (((int *)data)[0] & CONS_MOUSECURSOR_COLORS) { ((int *)data)[1] = cap->mouse_ba; ((int *)data)[2] = cap->mouse_ia; } else { ((int *)data)[1] = cap->base; ((int *)data)[2] = cap->height; } ((int *)data)[0] = cap->flags; return 0; case CONS_SETCURSORSHAPE: /* set cursor shape (new interface) */ s = spltty(); sc_change_cursor_shape(scp, ((int *)data)[0], ((int *)data)[1], ((int *)data)[2]); splx(s); return 0; case CONS_BELLTYPE: /* set bell type sound/visual */ if ((*(int *)data) & CONS_VISUAL_BELL) sc->flags |= SC_VISUAL_BELL; else sc->flags &= ~SC_VISUAL_BELL; if ((*(int *)data) & CONS_QUIET_BELL) sc->flags |= SC_QUIET_BELL; else sc->flags &= ~SC_QUIET_BELL; return 0; case CONS_GETINFO: /* get current (virtual) console info */ { vid_info_t *ptr = (vid_info_t*)data; if (ptr->size == sizeof(struct vid_info)) { ptr->m_num = sc->cur_scp->index; ptr->font_size = scp->font_size; ptr->mv_col = scp->xpos; ptr->mv_row = scp->ypos; ptr->mv_csz = scp->xsize; ptr->mv_rsz = scp->ysize; ptr->mv_hsz = (scp->history != NULL) ? scp->history->vtb_rows : 0; /* * The following fields are filled by the terminal emulator. XXX * * ptr->mv_norm.fore * ptr->mv_norm.back * ptr->mv_rev.fore * ptr->mv_rev.back */ ptr->mv_grfc.fore = 0; /* not supported */ ptr->mv_grfc.back = 0; /* not supported */ ptr->mv_ovscan = scp->border; if (scp == sc->cur_scp) save_kbd_state(scp); ptr->mk_keylock = scp->status & LOCK_MASK; return 0; } return EINVAL; } case CONS_GETVERS: /* get version number */ *(int*)data = 0x200; /* version 2.0 */ return 0; case CONS_IDLE: /* see if the screen has been idle */ /* * When the screen is in the GRAPHICS_MODE or UNKNOWN_MODE, * the user process may have been writing something on the * screen and syscons is not aware of it. Declare the screen * is NOT idle if it is in one of these modes. But there is * an exception to it; if a screen saver is running in the * graphics mode in the current screen, we should say that the * screen has been idle. */ *(int *)data = (sc->flags & SC_SCRN_IDLE) && (!ISGRAPHSC(sc->cur_scp) || (sc->cur_scp->status & SAVER_RUNNING)); return 0; case CONS_SAVERMODE: /* set saver mode */ switch(*(int *)data) { case CONS_NO_SAVER: case CONS_USR_SAVER: /* if a LKM screen saver is running, stop it first. 
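The CONS_GETCURSORSHAPE handler above decodes two flag bits into one of four cursor_attr slots: the global current and default attributes, and the per-vty base and default attributes. A hedged sketch of the same two-bit decode, as a hypothetical helper (the types and CONS_* flags are the driver's own):

/* Sketch: the (CONS_DEFAULT_CURSOR | CONS_LOCAL_CURSOR) selection used
 * by CONS_GETCURSORSHAPE; "sc" and "scp" as in sctty_ioctl() above. */
static struct cursor_attr *
pick_curs_attr(sc_softc_t *sc, scr_stat *scp, int flags)
{
	switch (flags & (CONS_DEFAULT_CURSOR | CONS_LOCAL_CURSOR)) {
	case 0:				/* global, current  */
		return (&sc->curs_attr);
	case CONS_LOCAL_CURSOR:		/* per-vty, current */
		return (&scp->base_curs_attr);
	case CONS_DEFAULT_CURSOR:	/* global, default  */
		return (&sc->dflt_curs_attr);
	default:			/* both: per-vty, default */
		return (&scp->dflt_curs_attr);
	}
}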
*/ scsplash_stick(FALSE); saver_mode = *(int *)data; s = spltty(); #ifdef DEV_SPLASH if ((error = wait_scrn_saver_stop(NULL))) { splx(s); return error; } #endif run_scrn_saver = TRUE; if (saver_mode == CONS_USR_SAVER) scp->status |= SAVER_RUNNING; else scp->status &= ~SAVER_RUNNING; scsplash_stick(TRUE); splx(s); break; case CONS_LKM_SAVER: s = spltty(); if ((saver_mode == CONS_USR_SAVER) && (scp->status & SAVER_RUNNING)) scp->status &= ~SAVER_RUNNING; saver_mode = *(int *)data; splx(s); break; default: return EINVAL; } return 0; case CONS_SAVERSTART: /* immediately start/stop the screen saver */ /* * Note that this ioctl does not guarantee the screen saver * actually starts or stops. It merely attempts to do so... */ s = spltty(); run_scrn_saver = (*(int *)data != 0); if (run_scrn_saver) sc->scrn_time_stamp -= scrn_blank_time; splx(s); return 0; case CONS_SCRSHOT: /* get a screen shot */ { int retval, hist_rsz; size_t lsize, csize; vm_offset_t frbp, hstp; unsigned lnum; scrshot_t *ptr = (scrshot_t *)data; void *outp = ptr->buf; if (ptr->x < 0 || ptr->y < 0 || ptr->xsize < 0 || ptr->ysize < 0) return EINVAL; s = spltty(); if (ISGRAPHSC(scp)) { splx(s); return EOPNOTSUPP; } hist_rsz = (scp->history != NULL) ? scp->history->vtb_rows : 0; if (((u_int)ptr->x + ptr->xsize) > scp->xsize || ((u_int)ptr->y + ptr->ysize) > (scp->ysize + hist_rsz)) { splx(s); return EINVAL; } lsize = scp->xsize * sizeof(u_int16_t); csize = ptr->xsize * sizeof(u_int16_t); /* Pointer to the last line of framebuffer */ frbp = scp->vtb.vtb_buffer + scp->ysize * lsize + ptr->x * sizeof(u_int16_t); /* Pointer to the last line of target buffer */ outp = (char *)outp + ptr->ysize * csize; /* Pointer to the last line of history buffer */ if (scp->history != NULL) hstp = scp->history->vtb_buffer + sc_vtb_tail(scp->history) * sizeof(u_int16_t) + ptr->x * sizeof(u_int16_t); else hstp = 0; retval = 0; for (lnum = 0; lnum < (ptr->y + ptr->ysize); lnum++) { if (lnum < scp->ysize) { frbp -= lsize; } else { hstp -= lsize; if (hstp < scp->history->vtb_buffer) hstp += scp->history->vtb_rows * lsize; frbp = hstp; } if (lnum < ptr->y) continue; outp = (char *)outp - csize; retval = copyout((void *)frbp, outp, csize); if (retval != 0) break; } splx(s); return retval; } case VT_SETMODE: /* set screen switcher mode */ { struct vt_mode *mode; struct proc *p1; mode = (struct vt_mode *)data; DPRINTF(5, ("%s%d: VT_SETMODE ", SC_DRIVER_NAME, sc->unit)); if (scp->smode.mode == VT_PROCESS) { p1 = pfind(scp->pid); if (scp->proc == p1 && scp->proc != td->td_proc) { if (p1) PROC_UNLOCK(p1); DPRINTF(5, ("error EPERM\n")); return EPERM; } if (p1) PROC_UNLOCK(p1); } s = spltty(); if (mode->mode == VT_AUTO) { scp->smode.mode = VT_AUTO; scp->proc = NULL; scp->pid = 0; DPRINTF(5, ("VT_AUTO, ")); if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit)) cnavailable(sc_consptr, TRUE); /* were we in the middle of the vty switching process? 
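The CONS_SCRSHOT loop above copies the snapshot bottom-up: it positions the framebuffer, history, and output pointers one line past their last rows, decrements each per iteration, and falls back from the framebuffer into the circular history buffer once lnum passes the on-screen row count. A trimmed sketch of that traversal, with copyout() replaced by memcpy() so it is testable outside the kernel (all names here are assumptions, not the driver's):

#include <string.h>

/* Sketch of the bottom-up screenshot walk.  lines_on_screen plays the
 * role of scp->ysize; want_y/want_h of ptr->y/ptr->ysize. */
static void
copy_lines_backward(char *frame, char *hist, size_t hist_sz, char *out,
    size_t lsize, int lines_on_screen, int want_y, int want_h)
{
	char *src = frame + (size_t)lines_on_screen * lsize;
	char *hsrc = hist + hist_sz;	/* one past the newest history line */
	int lnum;

	out += (size_t)want_h * lsize;	/* one past the last output line */
	for (lnum = 0; lnum < want_y + want_h; lnum++) {
		if (lnum < lines_on_screen)
			src -= lsize;		/* still inside the frame */
		else {
			hsrc -= lsize;		/* wrap in circular history */
			if (hsrc < hist)
				hsrc += hist_sz;
			src = hsrc;
		}
		if (lnum < want_y)
			continue;		/* below the requested window */
		out -= lsize;
		memcpy(out, src, lsize);
	}
}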
*/ if (finish_vt_rel(scp, TRUE, &s) == 0) DPRINTF(5, ("reset WAIT_REL, ")); if (finish_vt_acq(scp) == 0) DPRINTF(5, ("reset WAIT_ACQ, ")); } else { if (!ISSIGVALID(mode->relsig) || !ISSIGVALID(mode->acqsig) || !ISSIGVALID(mode->frsig)) { splx(s); DPRINTF(5, ("error EINVAL\n")); return EINVAL; } DPRINTF(5, ("VT_PROCESS %d, ", td->td_proc->p_pid)); bcopy(data, &scp->smode, sizeof(struct vt_mode)); scp->proc = td->td_proc; scp->pid = scp->proc->p_pid; if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit)) cnavailable(sc_consptr, FALSE); } splx(s); DPRINTF(5, ("\n")); return 0; } case VT_GETMODE: /* get screen switcher mode */ bcopy(&scp->smode, data, sizeof(struct vt_mode)); return 0; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('v', 4): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VT_RELDISP: /* screen switcher ioctl */ s = spltty(); /* * This must be the current vty which is in the VT_PROCESS * switching mode... */ if ((scp != sc->cur_scp) || (scp->smode.mode != VT_PROCESS)) { splx(s); return EINVAL; } /* ...and this process is controlling it. */ if (scp->proc != td->td_proc) { splx(s); return EPERM; } error = EINVAL; switch(*(int *)data) { case VT_FALSE: /* user refuses to release screen, abort */ if ((error = finish_vt_rel(scp, FALSE, &s)) == 0) DPRINTF(5, ("%s%d: VT_FALSE\n", SC_DRIVER_NAME, sc->unit)); break; case VT_TRUE: /* user has released screen, go on */ if ((error = finish_vt_rel(scp, TRUE, &s)) == 0) DPRINTF(5, ("%s%d: VT_TRUE\n", SC_DRIVER_NAME, sc->unit)); break; case VT_ACKACQ: /* acquire acknowledged, switch completed */ if ((error = finish_vt_acq(scp)) == 0) DPRINTF(5, ("%s%d: VT_ACKACQ\n", SC_DRIVER_NAME, sc->unit)); break; default: break; } splx(s); return error; case VT_OPENQRY: /* return free virtual console */ for (i = sc->first_vty; i < sc->first_vty + sc->vtys; i++) { tp = SC_DEV(sc, i); if (!tty_opened_ns(tp)) { *(int *)data = i + 1; return 0; } } return EINVAL; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('v', 5): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VT_ACTIVATE: /* switch to screen *data */ i = (*(int *)data == 0) ? scp->index : (*(int *)data - 1); s = spltty(); error = sc_clean_up(sc->cur_scp); splx(s); if (error) return error; error = sc_switch_scr(sc, i); return (error); #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('v', 6): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VT_WAITACTIVE: /* wait for switch to occur */ i = (*(int *)data == 0) ? 
scp->index : (*(int *)data - 1); if ((i < sc->first_vty) || (i >= sc->first_vty + sc->vtys)) return EINVAL; if (i == sc->cur_scp->index) return 0; error = tsleep(VTY_WCHAN(sc, i), (PZERO + 1) | PCATCH, "waitvt", 0); return error; case VT_GETACTIVE: /* get active vty # */ *(int *)data = sc->cur_scp->index + 1; return 0; case VT_GETINDEX: /* get this vty # */ *(int *)data = scp->index + 1; return 0; case VT_LOCKSWITCH: /* prevent vty switching */ if ((*(int *)data) & 0x01) sc->flags |= SC_SCRN_VTYLOCK; else sc->flags &= ~SC_SCRN_VTYLOCK; return 0; case KDENABIO: /* allow io operations */ error = priv_check(td, PRIV_IO); if (error != 0) return error; error = securelevel_gt(td->td_ucred, 0); if (error != 0) return error; #ifdef __i386__ td->td_frame->tf_eflags |= PSL_IOPL; #elif defined(__amd64__) td->td_frame->tf_rflags |= PSL_IOPL; #endif return 0; case KDDISABIO: /* disallow io operations (default) */ #ifdef __i386__ td->td_frame->tf_eflags &= ~PSL_IOPL; #elif defined(__amd64__) td->td_frame->tf_rflags &= ~PSL_IOPL; #endif return 0; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 20): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case KDSKBSTATE: /* set keyboard state (locks) */ if (*(int *)data & ~LOCK_MASK) return EINVAL; scp->status &= ~LOCK_MASK; scp->status |= *(int *)data; if (scp == sc->cur_scp) update_kbd_state(scp, scp->status, LOCK_MASK); return 0; case KDGKBSTATE: /* get keyboard state (locks) */ if (scp == sc->cur_scp) save_kbd_state(scp); *(int *)data = scp->status & LOCK_MASK; return 0; case KDGETREPEAT: /* get keyboard repeat & delay rates */ case KDSETREPEAT: /* set keyboard repeat & delay rates (new) */ error = kbdd_ioctl(sc->kbd, cmd, data); if (error == ENOIOCTL) error = ENODEV; return error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 67): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case KDSETRAD: /* set keyboard repeat & delay rates (old) */ if (*(int *)data & ~0x7f) return EINVAL; error = kbdd_ioctl(sc->kbd, KDSETRAD, data); if (error == ENOIOCTL) error = ENODEV; return error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 7): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case KDSKBMODE: /* set keyboard mode */ switch (*(int *)data) { case K_XLATE: /* switch to XLT ascii mode */ case K_RAW: /* switch to RAW scancode mode */ case K_CODE: /* switch to CODE mode */ scp->kbd_mode = *(int *)data; if (scp == sc->cur_scp) (void)kbdd_ioctl(sc->kbd, KDSKBMODE, data); return 0; default: return EINVAL; } /* NOT REACHED */ case KDGKBMODE: /* get keyboard mode */ *(int *)data = scp->kbd_mode; return 0; case KDGKBINFO: error = kbdd_ioctl(sc->kbd, cmd, data); if (error == ENOIOCTL) error = ENODEV; return error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 8): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case KDMKTONE: /* sound the bell */ if (*(int*)data) sc_bell(scp, (*(int*)data)&0xffff, (((*(int*)data)>>16)&0xffff)*hz/1000); else sc_bell(scp, scp->bell_pitch, scp->bell_duration); return 0; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 63): ival = IOCPARM_IVAL(data); data = 
(caddr_t)&ival; /* FALLTHROUGH */ #endif case KIOCSOUND: /* make tone (*data) hz */ if (scp == sc->cur_scp) { if (*(int *)data) return sc_tone(*(int *)data); else return sc_tone(0); } return 0; case KDGKBTYPE: /* get keyboard type */ error = kbdd_ioctl(sc->kbd, cmd, data); if (error == ENOIOCTL) { /* always return something? XXX */ *(int *)data = 0; } return 0; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('K', 66): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case KDSETLED: /* set keyboard LED status */ if (*(int *)data & ~LED_MASK) /* FIXME: LOCK_MASK? */ return EINVAL; scp->status &= ~LED_MASK; scp->status |= *(int *)data; if (scp == sc->cur_scp) update_kbd_leds(scp, scp->status); return 0; case KDGETLED: /* get keyboard LED status */ if (scp == sc->cur_scp) save_kbd_state(scp); *(int *)data = scp->status & LED_MASK; return 0; case KBADDKBD: /* add/remove keyboard to/from mux */ case KBRELKBD: error = kbdd_ioctl(sc->kbd, cmd, data); if (error == ENOIOCTL) error = ENODEV; return error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) case _IO('c', 110): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case CONS_SETKBD: /* set the new keyboard */ { keyboard_t *newkbd; s = spltty(); newkbd = kbd_get_keyboard(*(int *)data); if (newkbd == NULL) { splx(s); return EINVAL; } error = 0; if (sc->kbd != newkbd) { i = kbd_allocate(newkbd->kb_name, newkbd->kb_unit, (void *)&sc->keyboard, sckbdevent, sc); /* i == newkbd->kb_index */ if (i >= 0) { if (sc->kbd != NULL) { save_kbd_state(sc->cur_scp); kbd_release(sc->kbd, (void *)&sc->keyboard); } sc->kbd = kbd_get_keyboard(i); /* sc->kbd == newkbd */ sc->keyboard = i; (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&sc->cur_scp->kbd_mode); update_kbd_state(sc->cur_scp, sc->cur_scp->status, LOCK_MASK); } else { error = EPERM; /* XXX */ } } splx(s); return error; } case CONS_RELKBD: /* release the current keyboard */ s = spltty(); error = 0; if (sc->kbd != NULL) { save_kbd_state(sc->cur_scp); error = kbd_release(sc->kbd, (void *)&sc->keyboard); if (error == 0) { sc->kbd = NULL; sc->keyboard = -1; } } splx(s); return error; case CONS_GETTERM: /* get the current terminal emulator info */ { sc_term_sw_t *sw; if (((term_info_t *)data)->ti_index == 0) { sw = scp->tsw; } else { sw = sc_term_match_by_number(((term_info_t *)data)->ti_index); } if (sw != NULL) { strncpy(((term_info_t *)data)->ti_name, sw->te_name, sizeof(((term_info_t *)data)->ti_name)); strncpy(((term_info_t *)data)->ti_desc, sw->te_desc, sizeof(((term_info_t *)data)->ti_desc)); ((term_info_t *)data)->ti_flags = 0; return 0; } else { ((term_info_t *)data)->ti_name[0] = '\0'; ((term_info_t *)data)->ti_desc[0] = '\0'; ((term_info_t *)data)->ti_flags = 0; return EINVAL; } } case CONS_SETTERM: /* set the current terminal emulator */ s = spltty(); error = sc_init_emulator(scp, ((term_info_t *)data)->ti_name); /* FIXME: what if scp == sc_console! 
XXX */ splx(s); return error; case GIO_SCRNMAP: /* get output translation table */ bcopy(&sc->scr_map, data, sizeof(sc->scr_map)); return 0; case PIO_SCRNMAP: /* set output translation table */ bcopy(data, &sc->scr_map, sizeof(sc->scr_map)); for (i=0; i<sizeof(sc->scr_map); i++) { sc->scr_rmap[sc->scr_map[i]] = i; } return 0; case GIO_KEYMAP: /* get keyboard translation table */ case PIO_KEYMAP: /* set keyboard translation table */ case OGIO_KEYMAP: /* get keyboard translation table (compat) */ case OPIO_KEYMAP: /* set keyboard translation table (compat) */ case GIO_DEADKEYMAP: /* get accent key translation table */ case PIO_DEADKEYMAP: /* set accent key translation table */ case GETFKEY: /* get function key string */ case SETFKEY: /* set function key string */ error = kbdd_ioctl(sc->kbd, cmd, data); if (error == ENOIOCTL) error = ENODEV; return error; #ifndef SC_NO_FONT_LOADING case PIO_FONT8x8: /* set 8x8 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; bcopy(data, sc->font_8, 8*256); sc->fonts_loaded |= FONT_8; /* * FONT KLUDGE * Always use the font page #0. XXX * Don't load if the current font size is not 8x8. */ if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size < 14)) sc_load_font(sc->cur_scp, 0, 8, 8, sc->font_8, 0, 256); return 0; case GIO_FONT8x8: /* get 8x8 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; if (sc->fonts_loaded & FONT_8) { bcopy(sc->font_8, data, 8*256); return 0; } else return ENXIO; case PIO_FONT8x14: /* set 8x14 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; bcopy(data, sc->font_14, 14*256); sc->fonts_loaded |= FONT_14; /* * FONT KLUDGE * Always use the font page #0. XXX * Don't load if the current font size is not 8x14. */ if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size >= 14) && (sc->cur_scp->font_size < 16)) sc_load_font(sc->cur_scp, 0, 14, 8, sc->font_14, 0, 256); return 0; case GIO_FONT8x14: /* get 8x14 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; if (sc->fonts_loaded & FONT_14) { bcopy(sc->font_14, data, 14*256); return 0; } else return ENXIO; case PIO_FONT8x16: /* set 8x16 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; bcopy(data, sc->font_16, 16*256); sc->fonts_loaded |= FONT_16; /* * FONT KLUDGE * Always use the font page #0. XXX * Don't load if the current font size is not 8x16. */ if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size >= 16)) sc_load_font(sc->cur_scp, 0, 16, 8, sc->font_16, 0, 256); return 0; case GIO_FONT8x16: /* get 8x16 dot font */ if (!ISFONTAVAIL(sc->adp->va_flags)) return ENXIO; if (sc->fonts_loaded & FONT_16) { bcopy(sc->font_16, data, 16*256); return 0; } else return ENXIO; #endif /* SC_NO_FONT_LOADING */ default: break; } return (ENOIOCTL); } static int consolectl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { return sctty_ioctl(dev->si_drv1, cmd, data, td); } static int consolectl_close(struct cdev *dev, int flags, int mode, struct thread *td) { #ifndef SC_NO_SYSMOUSE mouse_info_t info; memset(&info, 0, sizeof(info)); info.operation = MOUSE_ACTION; /* * Make sure all buttons are released when moused and other * console daemons exit, so that no buttons are left pressed. */
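PIO_SCRNMAP above installs a 256-entry output translation table and then derives scr_rmap as its inverse, so a screen glyph can be mapped back to the code that produced it; for a permutation map, scr_rmap[scr_map[i]] == i holds for every i. A minimal standalone illustration (array names here are hypothetical stand-ins for the softc fields):

/* Sketch: building the reverse screen map exactly as PIO_SCRNMAP does. */
static unsigned char scr_map_x[256], scr_rmap_x[256];

static void
build_rmap(const unsigned char *newmap)
{
	unsigned int i;

	for (i = 0; i < 256; i++)
		scr_map_x[i] = newmap[i];
	for (i = 0; i < 256; i++)
		scr_rmap_x[scr_map_x[i]] = (unsigned char)i;
	/* invariant (permutation case): scr_rmap_x[scr_map_x[i]] == i */
}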
*/ (void) sctty_ioctl(dev->si_drv1, CONS_MOUSECTL, (caddr_t)&info, td); #endif return (0); } static void sc_cnprobe(struct consdev *cp) { int unit; int flags; if (!vty_enabled(VTY_SC)) { cp->cn_pri = CN_DEAD; return; } cp->cn_pri = sc_get_cons_priority(&unit, &flags); /* a video card is always required */ if (!scvidprobe(unit, flags, TRUE)) cp->cn_pri = CN_DEAD; /* syscons will become console even when there is no keyboard */ sckbdprobe(unit, flags, TRUE); if (cp->cn_pri == CN_DEAD) return; /* initialize required fields */ strcpy(cp->cn_name, "ttyv0"); } static void sc_cninit(struct consdev *cp) { int unit; int flags; sc_get_cons_priority(&unit, &flags); scinit(unit, flags | SC_KERNEL_CONSOLE); sc_console_unit = unit; sc_console = sc_get_stat(sc_get_softc(unit, SC_KERNEL_CONSOLE)->dev[0]); sc_consptr = cp; } static void sc_cnterm(struct consdev *cp) { void *ts; int i; /* we are not the kernel console any more, release everything */ if (sc_console_unit < 0) return; /* shouldn't happen */ #if 0 /* XXX */ sc_clear_screen(sc_console); sccnupdate(sc_console); #endif if (sc_ktsw != NULL) { for (i = 0; i <= mp_maxid; i++) { ts = sc_kts[i]; sc_kts[i] = NULL; (*sc_ktsw->te_term)(sc_console, &ts); free(ts, M_DEVBUF); } sc_ktsw = NULL; } scterm(sc_console_unit, SC_KERNEL_CONSOLE); sc_console_unit = -1; sc_console = NULL; } static void sccnclose(sc_softc_t *sc, struct sc_cnstate *sp); static int sc_cngetc_locked(struct sc_cnstate *sp); static void sccnkbdlock(sc_softc_t *sc, struct sc_cnstate *sp); static void sccnkbdunlock(sc_softc_t *sc, struct sc_cnstate *sp); static void sccnopen(sc_softc_t *sc, struct sc_cnstate *sp, int flags); static void sccnscrlock(sc_softc_t *sc, struct sc_cnstate *sp); static void sccnscrunlock(sc_softc_t *sc, struct sc_cnstate *sp); static void sccnkbdlock(sc_softc_t *sc, struct sc_cnstate *sp) { /* * Locking method: hope for the best. * The keyboard is supposed to be Giant locked. We can't handle that * in general. The kdb_active case here is not safe, and we will * proceed without the lock in all cases. */ sp->kbd_locked = !kdb_active && mtx_trylock(&Giant); } static void sccnkbdunlock(sc_softc_t *sc, struct sc_cnstate *sp) { if (sp->kbd_locked) mtx_unlock(&Giant); sp->kbd_locked = FALSE; } static void sccnscrlock(sc_softc_t *sc, struct sc_cnstate *sp) { int retries; /** * Locking method: * - if kdb_active and video_mtx is not owned by anyone, then lock * by kdb remaining active * - if !kdb_active, try to acquire video_mtx without blocking or * recursing; if we get it then it works normally. * Note that video_mtx is especially unusable if we already own it, * since then it is protecting something and syscons is not reentrant * enough to ignore the protection even in the kdb_active case. */ if (kdb_active) { sp->kdb_locked = sc->video_mtx.mtx_lock == MTX_UNOWNED || panicstr; sp->mtx_locked = FALSE; } else { sp->kdb_locked = FALSE; for (retries = 0; retries < 1000; retries++) { sp->mtx_locked = mtx_trylock_spin_flags(&sc->video_mtx, MTX_QUIET) != 0 || panicstr; if (sp->mtx_locked) break; DELAY(1); } } } static void sccnscrunlock(sc_softc_t *sc, struct sc_cnstate *sp) { if (sp->mtx_locked) mtx_unlock_spin(&sc->video_mtx); sp->mtx_locked = sp->kdb_locked = FALSE; } static void sccnopen(sc_softc_t *sc, struct sc_cnstate *sp, int flags) { int kbd_mode; /* assert(sc_console_unit >= 0) */ sp->kbd_opened = FALSE; sp->scr_opened = FALSE; sp->kbd_locked = FALSE; /* Opening the keyboard is optional. 
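*/

sccnscrlock() above acquires the video spin mutex on a best-effort basis: under kdb it only checks that nobody owns the lock, and otherwise it retries a non-blocking trylock roughly a thousand times with 1 microsecond pauses before giving up, so console output can never deadlock. The shape of that bounded-retry idiom, sketched with pthreads instead of the kernel mtx API (an analogy, not the driver's code):

#include <pthread.h>
#include <unistd.h>

/* Sketch: bounded trylock, ~1000 attempts with a 1us pause, mirroring
 * the retries/DELAY(1) loop in sccnscrlock().  Returns 1 on success. */
static int
bounded_trylock(pthread_mutex_t *m)
{
	int retries;

	for (retries = 0; retries < 1000; retries++) {
		if (pthread_mutex_trylock(m) == 0)
			return (1);
		usleep(1);		/* like DELAY(1) */
	}
	return (0);	/* caller proceeds unlocked, as the console does */
}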
if (!(flags & 1) || sc->kbd == NULL) goto over_keyboard; sccnkbdlock(sc, sp); /* * Make sure the keyboard is accessible even when the kbd device * driver is disabled. */ kbdd_enable(sc->kbd); /* Switch the keyboard to console mode (K_XLATE, polled) on all scp's. */ kbd_mode = K_XLATE; (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&kbd_mode); sc->kbd_open_level++; kbdd_poll(sc->kbd, TRUE); sp->kbd_opened = TRUE; over_keyboard: ; /* The screen is opened iff locking it succeeds. */ sccnscrlock(sc, sp); if (!sp->kdb_locked && !sp->mtx_locked) return; sp->scr_opened = TRUE; /* The screen switch is optional. */ if (!(flags & 2)) return; /* try to switch to the kernel console screen */ if (!cold && sc->cur_scp->index != sc_console->index && sc->cur_scp->smode.mode == VT_AUTO && sc_console->smode.mode == VT_AUTO) sc_switch_scr(sc, sc_console->index); } static void sccnclose(sc_softc_t *sc, struct sc_cnstate *sp) { sp->scr_opened = FALSE; sccnscrunlock(sc, sp); if (!sp->kbd_opened) return; /* Restore keyboard mode (for the current, possibly-changed scp). */ kbdd_poll(sc->kbd, FALSE); if (--sc->kbd_open_level == 0) (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&sc->cur_scp->kbd_mode); kbdd_disable(sc->kbd); sp->kbd_opened = FALSE; sccnkbdunlock(sc, sp); } /* * Grabbing switches the screen and keyboard focus to sc_console and the * keyboard mode to (K_XLATE, polled). Only switching to polled mode is * essential (for preventing the interrupt handler from eating input * between polls). Focus is part of the UI, and the other switches * work just as well when they are done on every entry and exit. * * Screen switches while grabbed are supported, and to maintain focus for * this, ungrabbing and closing only restore the polling state and then * the keyboard mode if on the original screen. */ static void sc_cngrab(struct consdev *cp) { sc_softc_t *sc; int lev; sc = sc_console->sc; lev = atomic_fetchadd_int(&sc->grab_level, 1); if (lev >= 0 && lev < 2) { sccnopen(sc, &sc->grab_state[lev], 1 | 2); sccnscrunlock(sc, &sc->grab_state[lev]); sccnkbdunlock(sc, &sc->grab_state[lev]); } } static void sc_cnungrab(struct consdev *cp) { sc_softc_t *sc; int lev; sc = sc_console->sc; lev = atomic_load_acq_int(&sc->grab_level) - 1; if (lev >= 0 && lev < 2) { sccnkbdlock(sc, &sc->grab_state[lev]); sccnscrlock(sc, &sc->grab_state[lev]); sccnclose(sc, &sc->grab_state[lev]); } atomic_add_int(&sc->grab_level, -1); } static char sc_cnputc_log[0x1000]; static u_int sc_cnputc_loghead; static u_int sc_cnputc_logtail; static void sc_cnputc(struct consdev *cd, int c) { struct sc_cnstate st; u_char buf[1]; scr_stat *scp = sc_console; void *oldts, *ts; struct sc_term_sw *oldtsw; #ifndef SC_NO_HISTORY #if 0 struct tty *tp; #endif #endif /* !SC_NO_HISTORY */ u_int head; int s; /* assert(sc_console != NULL) */ sccnopen(scp->sc, &st, 0); /* * Log the output. * * In the unlocked case, the logging is intentionally only * perfectly atomic for the indexes. */ head = atomic_fetchadd_int(&sc_cnputc_loghead, 1); sc_cnputc_log[head % sizeof(sc_cnputc_log)] = c; /* * If we couldn't open, do special reentrant output and return to defer * normal output. */
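sccnopen()/sccnclose() above treat keyboard polling as a nestable resource: each open bumps kbd_open_level and forces (K_XLATE, polled) mode, and only the close that drops the level back to zero restores the current screen's own keyboard mode. A reduced sketch of that reference-counted pairing (field names are stand-ins for the driver's):

/* Sketch: nested console-keyboard opens; only the outermost close
 * restores the saved mode, as sccnclose() does with cur_scp->kbd_mode. */
struct kbd_nest {
	int open_level;
	int restore_mode;
};

static void
kbd_nest_open(struct kbd_nest *k)
{
	k->open_level++;	/* force polled K_XLATE mode here */
}

static void
kbd_nest_close(struct kbd_nest *k)
{
	if (--k->open_level == 0) {
		/* reinstall k->restore_mode here */
	}
}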
*/ if (!st.scr_opened) { ec_putc(c); return; } #ifndef SC_NO_HISTORY if (scp == scp->sc->cur_scp && scp->status & SLKED) { scp->status &= ~SLKED; update_kbd_state(scp, scp->status, SLKED); if (scp->status & BUFFER_SAVED) { if (!sc_hist_restore(scp)) sc_remove_cutmarking(scp); scp->status &= ~BUFFER_SAVED; scp->status |= CURSOR_ENABLED; sc_draw_cursor_image(scp); } #if 0 /* * XXX: Now that TTY's have their own locks, we cannot process * any data after disabling scroll lock. cnputs already holds a * spinlock. */ tp = SC_DEV(scp->sc, scp->index); /* XXX "tp" can be NULL */ tty_lock(tp); if (tty_opened(tp)) sctty_outwakeup(tp); tty_unlock(tp); #endif } #endif /* !SC_NO_HISTORY */ /* Play any output still in the log (our char may already be done). */ while (sc_cnputc_logtail != atomic_load_acq_int(&sc_cnputc_loghead)) { buf[0] = sc_cnputc_log[sc_cnputc_logtail++ % sizeof(sc_cnputc_log)]; if (atomic_load_acq_int(&sc_cnputc_loghead) - sc_cnputc_logtail >= sizeof(sc_cnputc_log)) continue; /* Console output has a per-CPU "input" state. Switch for it. */ oldtsw = scp->tsw; oldts = scp->ts; ts = sc_kts[PCPU_GET(cpuid)]; if (ts != NULL) { scp->tsw = sc_ktsw; scp->ts = ts; (*scp->tsw->te_sync)(scp); } sc_puts(scp, buf, 1); scp->tsw = oldtsw; scp->ts = oldts; (*scp->tsw->te_sync)(scp); } s = spltty(); /* block sckbdevent and scrn_timer */ sccnupdate(scp); splx(s); sccnclose(scp->sc, &st); } static int sc_cngetc(struct consdev *cd) { struct sc_cnstate st; int c, s; /* assert(sc_console != NULL) */ sccnopen(sc_console->sc, &st, 1); s = spltty(); /* block sckbdevent and scrn_timer while we poll */ if (!st.kbd_opened) { splx(s); sccnclose(sc_console->sc, &st); return -1; /* means no keyboard since we fudged the locking */ } c = sc_cngetc_locked(&st); splx(s); sccnclose(sc_console->sc, &st); return c; } static int sc_cngetc_locked(struct sc_cnstate *sp) { static struct fkeytab fkey; static int fkeycp; scr_stat *scp; const u_char *p; int c; /* * Stop the screen saver and update the screen if necessary. * What if we have been running in the screen saver code... XXX */ if (sp->scr_opened) sc_touch_scrn_saver(); scp = sc_console->sc->cur_scp; /* XXX */ if (sp->scr_opened) sccnupdate(scp); if (fkeycp < fkey.len) return fkey.str[fkeycp++]; c = scgetc(scp->sc, SCGETC_CN | SCGETC_NONBLOCK, sp); switch (KEYFLAGS(c)) { case 0: /* normal char */ return KEYCHAR(c); case FKEY: /* function key */ p = (*scp->tsw->te_fkeystr)(scp, c); if (p != NULL) { fkey.len = strlen(p); bcopy(p, fkey.str, fkey.len); fkeycp = 1; return fkey.str[0]; } p = kbdd_get_fkeystr(scp->sc->kbd, KEYCHAR(c), (size_t *)&fkeycp); fkey.len = fkeycp; if ((p != NULL) && (fkey.len > 0)) { bcopy(p, fkey.str, fkey.len); fkeycp = 1; return fkey.str[0]; } return c; /* XXX */ case NOKEY: case ERRKEY: default: return -1; } /* NOT REACHED */ } static void sccnupdate(scr_stat *scp) { /* this is a cut-down version of scrn_timer()... */ if (suspend_in_progress || scp->sc->font_loading_in_progress) return; if (kdb_active || panicstr || shutdown_in_progress) { sc_touch_scrn_saver(); } else if (scp != scp->sc->cur_scp) { return; } if (!run_scrn_saver) scp->sc->flags &= ~SC_SCRN_IDLE; #ifdef DEV_SPLASH if ((saver_mode != CONS_LKM_SAVER) || !(scp->sc->flags & SC_SCRN_IDLE)) if (scp->sc->flags & SC_SCRN_BLANKED) stop_scrn_saver(scp->sc, current_saver); #endif if (scp != scp->sc->cur_scp || scp->sc->blink_in_progress || scp->sc->switch_in_progress) return; /* * FIXME: unlike scrn_timer(), we call scrn_update() from here even * when write_in_progress is non-zero. 
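sc_cnputc() above funnels every character through a 4 KiB power-of-two ring (sc_cnputc_log) indexed by free-running head/tail counters: the writer reserves a slot with atomic_fetchadd, and the replay loop drains tail up to head, discarding entries the writer has already lapped. A condensed model of that drain-and-overrun test (emit_one() is a hypothetical stand-in for the sc_puts() call):

#define LOG_SZ 0x1000u		/* power of two, like sc_cnputc_log */

static char log_buf[LOG_SZ];
static unsigned int head, tail;	/* free-running; slot is index % LOG_SZ */

/* Sketch of the replay loop in sc_cnputc(). */
static void
drain(void (*emit_one)(char))
{
	char c;

	while (tail != head) {
		c = log_buf[tail++ % LOG_SZ];
		/* if the writer lapped us, this entry was overwritten */
		if (head - tail >= LOG_SZ)
			continue;
		emit_one(c);
	}
}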
XXX */ if (!ISGRAPHSC(scp) && !(scp->sc->flags & SC_SCRN_BLANKED)) scrn_update(scp, TRUE); } static void scrn_timer(void *arg) { static time_t kbd_time_stamp = 0; sc_softc_t *sc; scr_stat *scp; int again, rate; again = (arg != NULL); if (arg != NULL) sc = (sc_softc_t *)arg; else if (sc_console != NULL) sc = sc_console->sc; else return; /* find the vty to update */ scp = sc->cur_scp; /* don't do anything when we are performing some I/O operations */ if (suspend_in_progress || sc->font_loading_in_progress) goto done; if ((sc->kbd == NULL) && (sc->config & SC_AUTODETECT_KBD)) { /* try to allocate a keyboard automatically */ if (kbd_time_stamp != time_uptime) { kbd_time_stamp = time_uptime; sc->keyboard = sc_allocate_keyboard(sc, -1); if (sc->keyboard >= 0) { sc->kbd = kbd_get_keyboard(sc->keyboard); (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&sc->cur_scp->kbd_mode); update_kbd_state(sc->cur_scp, sc->cur_scp->status, LOCK_MASK); } } } /* should we stop the screen saver? */ if (kdb_active || panicstr || shutdown_in_progress) sc_touch_scrn_saver(); if (run_scrn_saver) { if (time_uptime > sc->scrn_time_stamp + scrn_blank_time) sc->flags |= SC_SCRN_IDLE; else sc->flags &= ~SC_SCRN_IDLE; } else { sc->scrn_time_stamp = time_uptime; sc->flags &= ~SC_SCRN_IDLE; if (scrn_blank_time > 0) run_scrn_saver = TRUE; } #ifdef DEV_SPLASH if ((saver_mode != CONS_LKM_SAVER) || !(sc->flags & SC_SCRN_IDLE)) if (sc->flags & SC_SCRN_BLANKED) stop_scrn_saver(sc, current_saver); #endif /* should we just return ? */ if (sc->blink_in_progress || sc->switch_in_progress || sc->write_in_progress) goto done; /* Update the screen */ scp = sc->cur_scp; /* cur_scp may have changed... */ if (!ISGRAPHSC(scp) && !(sc->flags & SC_SCRN_BLANKED)) scrn_update(scp, TRUE); #ifdef DEV_SPLASH /* should we activate the screen saver? */ if ((saver_mode == CONS_LKM_SAVER) && (sc->flags & SC_SCRN_IDLE)) if (!ISGRAPHSC(scp) || (sc->flags & SC_SCRN_BLANKED)) (*current_saver)(sc, TRUE); #endif done: if (again) { /* * Use reduced "refresh" rate if we are in graphics and that is not a * graphical screen saver. In such case we just have nothing to do. 
*/ if (ISGRAPHSC(scp) && !(sc->flags & SC_SCRN_BLANKED)) rate = 2; else rate = 30; callout_reset_sbt(&sc->ctimeout, SBT_1S / rate, 0, scrn_timer, sc, C_PREL(1)); } } static int and_region(int *s1, int *e1, int s2, int e2) { if (*e1 < s2 || e2 < *s1) return FALSE; *s1 = imax(*s1, s2); *e1 = imin(*e1, e2); return TRUE; } static void scrn_update(scr_stat *scp, int show_cursor) { int start; int end; int s; int e; /* assert(scp == scp->sc->cur_scp) */ SC_VIDEO_LOCK(scp->sc); #ifndef SC_NO_CUTPASTE /* remove the previous mouse pointer image if necessary */ if (scp->status & MOUSE_VISIBLE) { s = scp->mouse_pos; e = scp->mouse_pos + scp->xsize + 1; if ((scp->status & (MOUSE_MOVED | MOUSE_HIDDEN)) || and_region(&s, &e, scp->start, scp->end) || ((scp->status & CURSOR_ENABLED) && (scp->cursor_pos != scp->cursor_oldpos) && (and_region(&s, &e, scp->cursor_pos, scp->cursor_pos) || and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos)))) { sc_remove_mouse_image(scp); if (scp->end >= scp->xsize*scp->ysize) scp->end = scp->xsize*scp->ysize - 1; } } #endif /* !SC_NO_CUTPASTE */ #if 1 /* debug: XXX */ if (scp->end >= scp->xsize*scp->ysize) { printf("scrn_update(): scp->end %d > size_of_screen!!\n", scp->end); scp->end = scp->xsize*scp->ysize - 1; } if (scp->start < 0) { printf("scrn_update(): scp->start %d < 0\n", scp->start); scp->start = 0; } #endif /* update screen image */ if (scp->start <= scp->end) { if (scp->mouse_cut_end >= 0) { /* there is a marked region for cut & paste */ if (scp->mouse_cut_start <= scp->mouse_cut_end) { start = scp->mouse_cut_start; end = scp->mouse_cut_end; } else { start = scp->mouse_cut_end; end = scp->mouse_cut_start - 1; } s = start; e = end; /* does the cut-mark region overlap with the update region? */ if (and_region(&s, &e, scp->start, scp->end)) { (*scp->rndr->draw)(scp, s, e - s + 1, TRUE); s = 0; e = start - 1; if (and_region(&s, &e, scp->start, scp->end)) (*scp->rndr->draw)(scp, s, e - s + 1, FALSE); s = end + 1; e = scp->xsize*scp->ysize - 1; if (and_region(&s, &e, scp->start, scp->end)) (*scp->rndr->draw)(scp, s, e - s + 1, FALSE); } else { (*scp->rndr->draw)(scp, scp->start, scp->end - scp->start + 1, FALSE); } } else { (*scp->rndr->draw)(scp, scp->start, scp->end - scp->start + 1, FALSE); } } /* we are not to show the cursor and the mouse pointer... */ if (!show_cursor) { scp->end = 0; scp->start = scp->xsize*scp->ysize - 1; SC_VIDEO_UNLOCK(scp->sc); return; } /* update cursor image */ if (scp->status & CURSOR_ENABLED) { s = scp->start; e = scp->end; /* did cursor move since last time ? */ if (scp->cursor_pos != scp->cursor_oldpos) { /* do we need to remove old cursor image ? 
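and_region() above destructively intersects the interval [*s1, *e1] with [s2, e2] and returns FALSE when they are disjoint; scrn_update() uses it both to test overlap and to clip the redraw range. For example, intersecting [5, 20] with [15, 30] narrows the first interval to [15, 20] and returns TRUE, while [5, 10] against [15, 30] returns FALSE and leaves the interval untouched. A small usage sketch:

/* Sketch: clipping an update range against a cut-mark region with
 * and_region(), the way scrn_update() does. */
static void
clip_example(void)
{
	int s = 5, e = 20;

	if (and_region(&s, &e, 15, 30)) {
		/* overlap: now s == 15, e == 20 -> redraw only 15..20 */
	} else {
		/* disjoint: s and e unchanged, nothing to re-highlight */
	}
}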
*/ if (!and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos)) sc_remove_cursor_image(scp); sc_draw_cursor_image(scp); } else { if (and_region(&s, &e, scp->cursor_pos, scp->cursor_pos)) /* cursor didn't move, but has been overwritten */ sc_draw_cursor_image(scp); else if (scp->curs_attr.flags & CONS_BLINK_CURSOR) /* if it's a blinking cursor, update it */ (*scp->rndr->blink_cursor)(scp, scp->cursor_pos, sc_inside_cutmark(scp, scp->cursor_pos)); } } #ifndef SC_NO_CUTPASTE /* update "pseudo" mouse pointer image */ if (scp->sc->flags & SC_MOUSE_ENABLED) { if (!(scp->status & (MOUSE_VISIBLE | MOUSE_HIDDEN))) { scp->status &= ~MOUSE_MOVED; sc_draw_mouse_image(scp); } } #endif /* SC_NO_CUTPASTE */ scp->end = 0; scp->start = scp->xsize*scp->ysize - 1; SC_VIDEO_UNLOCK(scp->sc); } #ifdef DEV_SPLASH static int scsplash_callback(int event, void *arg) { sc_softc_t *sc; int error; sc = (sc_softc_t *)arg; switch (event) { case SPLASH_INIT: if (add_scrn_saver(scsplash_saver) == 0) { sc->flags &= ~SC_SAVER_FAILED; run_scrn_saver = TRUE; if (cold && !(boothowto & RB_VERBOSE)) { scsplash_stick(TRUE); (*current_saver)(sc, TRUE); } } return 0; case SPLASH_TERM: if (current_saver == scsplash_saver) { scsplash_stick(FALSE); error = remove_scrn_saver(scsplash_saver); if (error) return error; } return 0; default: return EINVAL; } } static void scsplash_saver(sc_softc_t *sc, int show) { static int busy = FALSE; scr_stat *scp; if (busy) return; busy = TRUE; scp = sc->cur_scp; if (show) { if (!(sc->flags & SC_SAVER_FAILED)) { if (!(sc->flags & SC_SCRN_BLANKED)) set_scrn_saver_mode(scp, -1, NULL, 0); switch (splash(sc->adp, TRUE)) { case 0: /* succeeded */ break; case EAGAIN: /* try later */ restore_scrn_saver_mode(scp, FALSE); sc_touch_scrn_saver(); /* XXX */ break; default: sc->flags |= SC_SAVER_FAILED; scsplash_stick(FALSE); restore_scrn_saver_mode(scp, TRUE); printf("scsplash_saver(): failed to put up the image\n"); break; } } } else if (!sticky_splash) { if ((sc->flags & SC_SCRN_BLANKED) && (splash(sc->adp, FALSE) == 0)) restore_scrn_saver_mode(scp, TRUE); } busy = FALSE; } static int add_scrn_saver(void (*this_saver)(sc_softc_t *, int)) { #if 0 int error; if (current_saver != none_saver) { error = remove_scrn_saver(current_saver); if (error) return error; } #endif if (current_saver != none_saver) return EBUSY; run_scrn_saver = FALSE; saver_mode = CONS_LKM_SAVER; current_saver = this_saver; return 0; } static int remove_scrn_saver(void (*this_saver)(sc_softc_t *, int)) { if (current_saver != this_saver) return EINVAL; #if 0 /* * In order to prevent `current_saver' from being called by * the timeout routine `scrn_timer()' while we manipulate * the saver list, we shall set `current_saver' to `none_saver' * before stopping the current saver, rather than blocking by `splXX()'. 
*/ current_saver = none_saver; if (scrn_blanked) stop_scrn_saver(this_saver); #endif /* unblank all blanked screens */ wait_scrn_saver_stop(NULL); if (scrn_blanked) return EBUSY; current_saver = none_saver; return 0; } static int set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border) { int s; /* assert(scp == scp->sc->cur_scp) */ s = spltty(); if (!ISGRAPHSC(scp)) sc_remove_cursor_image(scp); scp->splash_save_mode = scp->mode; scp->splash_save_status = scp->status & (GRAPHICS_MODE | PIXEL_MODE); scp->status &= ~(GRAPHICS_MODE | PIXEL_MODE); scp->status |= (UNKNOWN_MODE | SAVER_RUNNING); scp->sc->flags |= SC_SCRN_BLANKED; ++scrn_blanked; splx(s); if (mode < 0) return 0; scp->mode = mode; if (set_mode(scp) == 0) { if (scp->sc->adp->va_info.vi_flags & V_INFO_GRAPHICS) scp->status |= GRAPHICS_MODE; #ifndef SC_NO_PALETTE_LOADING if (pal != NULL) vidd_load_palette(scp->sc->adp, pal); #endif sc_set_border(scp, border); return 0; } else { s = spltty(); scp->mode = scp->splash_save_mode; scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING); scp->status |= scp->splash_save_status; splx(s); return 1; } } static int restore_scrn_saver_mode(scr_stat *scp, int changemode) { int mode; int status; int s; /* assert(scp == scp->sc->cur_scp) */ s = spltty(); mode = scp->mode; status = scp->status; scp->mode = scp->splash_save_mode; scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING); scp->status |= scp->splash_save_status; scp->sc->flags &= ~SC_SCRN_BLANKED; if (!changemode) { if (!ISGRAPHSC(scp)) sc_draw_cursor_image(scp); --scrn_blanked; splx(s); return 0; } if (set_mode(scp) == 0) { #ifndef SC_NO_PALETTE_LOADING #ifdef SC_PIXEL_MODE if (scp->sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT) vidd_load_palette(scp->sc->adp, scp->sc->palette2); else #endif vidd_load_palette(scp->sc->adp, scp->sc->palette); #endif --scrn_blanked; splx(s); return 0; } else { scp->mode = mode; scp->status = status; splx(s); return 1; } } static void stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int)) { (*saver)(sc, FALSE); run_scrn_saver = FALSE; /* the screen saver may have chosen not to stop after all... */ if (sc->flags & SC_SCRN_BLANKED) return; mark_all(sc->cur_scp); if (sc->delayed_next_scr) sc_switch_scr(sc, sc->delayed_next_scr - 1); if (!kdb_active) wakeup(&scrn_blanked); } static int wait_scrn_saver_stop(sc_softc_t *sc) { int error = 0; while (scrn_blanked > 0) { run_scrn_saver = FALSE; if (sc && !(sc->flags & SC_SCRN_BLANKED)) { error = 0; break; } error = tsleep(&scrn_blanked, PZERO | PCATCH, "scrsav", 0); if ((error != 0) && (error != ERESTART)) break; } run_scrn_saver = FALSE; return error; } #endif /* DEV_SPLASH */ void sc_touch_scrn_saver(void) { scsplash_stick(FALSE); run_scrn_saver = FALSE; } int sc_switch_scr(sc_softc_t *sc, u_int next_scr) { scr_stat *cur_scp; struct tty *tp; struct proc *p; int s; DPRINTF(5, ("sc0: sc_switch_scr() %d ", next_scr + 1)); if (sc->cur_scp == NULL) return (0); /* prevent switch if previously requested */ if (sc->flags & SC_SCRN_VTYLOCK) { sc_bell(sc->cur_scp, sc->cur_scp->bell_pitch, sc->cur_scp->bell_duration); return EPERM; } /* delay switch if the screen is blanked or being updated */ if ((sc->flags & SC_SCRN_BLANKED) || sc->write_in_progress || sc->blink_in_progress) { sc->delayed_next_scr = next_scr + 1; sc_touch_scrn_saver(); DPRINTF(5, ("switch delayed\n")); return 0; } sc->delayed_next_scr = 0; s = spltty(); cur_scp = sc->cur_scp; /* we are in the middle of the vty switching process... 
*/ if (sc->switch_in_progress && (cur_scp->smode.mode == VT_PROCESS) && cur_scp->proc) { p = pfind(cur_scp->pid); if (cur_scp->proc != p) { if (p) PROC_UNLOCK(p); /* * The controlling process has died!!. Do some clean up. * NOTE:`cur_scp->proc' and `cur_scp->smode.mode' * are not reset here yet; they will be cleared later. */ DPRINTF(5, ("cur_scp controlling process %d died, ", cur_scp->pid)); if (cur_scp->status & SWITCH_WAIT_REL) { /* * Force the previous switch to finish, but return now * with error. */ DPRINTF(5, ("reset WAIT_REL, ")); finish_vt_rel(cur_scp, TRUE, &s); splx(s); DPRINTF(5, ("finishing previous switch\n")); return EINVAL; } else if (cur_scp->status & SWITCH_WAIT_ACQ) { /* let's assume screen switch has been completed. */ DPRINTF(5, ("reset WAIT_ACQ, ")); finish_vt_acq(cur_scp); } else { /* * We are in between screen release and acquisition, and * reached here via scgetc() or scrn_timer() which has * interrupted exchange_scr(). Don't do anything stupid. */ DPRINTF(5, ("waiting nothing, ")); } } else { if (p) PROC_UNLOCK(p); /* * The controlling process is alive, but not responding... * It is either buggy or it may be just taking time. * The following code is a gross kludge to cope with this * problem for which there is no clean solution. XXX */ if (cur_scp->status & SWITCH_WAIT_REL) { switch (sc->switch_in_progress++) { case 1: break; case 2: DPRINTF(5, ("sending relsig again, ")); signal_vt_rel(cur_scp); break; case 3: break; case 4: default: /* * Act as if the controlling program returned * VT_FALSE. */ DPRINTF(5, ("force reset WAIT_REL, ")); finish_vt_rel(cur_scp, FALSE, &s); splx(s); DPRINTF(5, ("act as if VT_FALSE was seen\n")); return EINVAL; } } else if (cur_scp->status & SWITCH_WAIT_ACQ) { switch (sc->switch_in_progress++) { case 1: break; case 2: DPRINTF(5, ("sending acqsig again, ")); signal_vt_acq(cur_scp); break; case 3: break; case 4: default: /* clear the flag and finish the previous switch */ DPRINTF(5, ("force reset WAIT_ACQ, ")); finish_vt_acq(cur_scp); break; } } } } /* * Return error if an invalid argument is given, or vty switch * is still in progress. */ if ((next_scr < sc->first_vty) || (next_scr >= sc->first_vty + sc->vtys) || sc->switch_in_progress) { splx(s); sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION); DPRINTF(5, ("error 1\n")); return EINVAL; } /* * Don't allow switching away from the graphics mode vty * if the switch mode is VT_AUTO, unless the next vty is the same * as the current or the current vty has been closed (but showing). */ tp = SC_DEV(sc, cur_scp->index); if ((cur_scp->index != next_scr) && tty_opened_ns(tp) && (cur_scp->smode.mode == VT_AUTO) && ISGRAPHSC(cur_scp)) { splx(s); sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION); DPRINTF(5, ("error, graphics mode\n")); return EINVAL; } /* * Is the wanted vty open? Don't allow switching to a closed vty. * If we are in DDB, don't switch to a vty in the VT_PROCESS mode. * Note that we always allow the user to switch to the kernel * console even if it is closed. */ if ((sc_console == NULL) || (next_scr != sc_console->index)) { tp = SC_DEV(sc, next_scr); if (!tty_opened_ns(tp)) { splx(s); sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION); DPRINTF(5, ("error 2, requested vty isn't open!\n")); return EINVAL; } if (kdb_active && SC_STAT(tp)->smode.mode == VT_PROCESS) { splx(s); DPRINTF(5, ("error 3, requested vty is in the VT_PROCESS mode\n")); return EINVAL; } } /* this is the start of vty switching process... 
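The retry machine above only makes progress when the controlling process answers its signals, which is why switch_in_progress escalates: resend relsig or acqsig on the second retry, then force the equivalent of VT_FALSE or a completed acquire on the fourth. For reference, the cooperating userland side looks roughly like the sketch below, built from the VT_SETMODE/VT_RELDISP ioctls handled earlier in sctty_ioctl(); the device path and SIGUSR1/SIGUSR2 choices are illustrative, not mandated:

#include <fcntl.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/consio.h>

static int vtfd;

static void on_rel(int sig) { ioctl(vtfd, VT_RELDISP, VT_TRUE); }
static void on_acq(int sig) { ioctl(vtfd, VT_RELDISP, VT_ACKACQ); }

/* Sketch: become the controlling process of a vty (VT_PROCESS mode).
 * Any two valid signals work; the kernel validates them with ISSIGVALID. */
static void
take_vt_process_mode(const char *dev)
{
	struct vt_mode m = { .mode = VT_PROCESS, .relsig = SIGUSR1,
	    .acqsig = SIGUSR2, .frsig = SIGUSR2 };

	vtfd = open(dev, O_RDWR);
	signal(SIGUSR1, on_rel);	/* kernel asks us to release */
	signal(SIGUSR2, on_acq);	/* kernel tells us we acquired */
	ioctl(vtfd, VT_SETMODE, &m);
}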
*/ ++sc->switch_in_progress; sc->old_scp = cur_scp; sc->new_scp = sc_get_stat(SC_DEV(sc, next_scr)); if (sc->new_scp == sc->old_scp) { sc->switch_in_progress = 0; /* * XXX wakeup() locks the scheduler lock which will hang if * the lock is in an in-between state, e.g., when we stop at * a breakpoint at fork_exit. It has always been wrong to call * wakeup() when the debugger is active. In RELENG_4, wakeup() * is supposed to be locked by splhigh(), but the debugger may * be invoked at splhigh(). */ if (!kdb_active) wakeup(VTY_WCHAN(sc,next_scr)); splx(s); DPRINTF(5, ("switch done (new == old)\n")); return 0; } /* has controlling process died? */ vt_proc_alive(sc->old_scp); vt_proc_alive(sc->new_scp); /* wait for the controlling process to release the screen, if necessary */ if (signal_vt_rel(sc->old_scp)) { splx(s); return 0; } /* go set up the new vty screen */ splx(s); exchange_scr(sc); s = spltty(); /* wake up processes waiting for this vty */ if (!kdb_active) wakeup(VTY_WCHAN(sc,next_scr)); /* wait for the controlling process to acknowledge, if necessary */ if (signal_vt_acq(sc->cur_scp)) { splx(s); return 0; } sc->switch_in_progress = 0; if (sc->unit == sc_console_unit) cnavailable(sc_consptr, TRUE); splx(s); DPRINTF(5, ("switch done\n")); return 0; } static int do_switch_scr(sc_softc_t *sc, int s) { vt_proc_alive(sc->new_scp); splx(s); exchange_scr(sc); s = spltty(); /* sc->cur_scp == sc->new_scp */ wakeup(VTY_WCHAN(sc,sc->cur_scp->index)); /* wait for the controlling process to acknowledge, if necessary */ if (!signal_vt_acq(sc->cur_scp)) { sc->switch_in_progress = 0; if (sc->unit == sc_console_unit) cnavailable(sc_consptr, TRUE); } return s; } static int vt_proc_alive(scr_stat *scp) { struct proc *p; if (scp->proc) { if ((p = pfind(scp->pid)) != NULL) PROC_UNLOCK(p); if (scp->proc == p) return TRUE; scp->proc = NULL; scp->smode.mode = VT_AUTO; DPRINTF(5, ("vt controlling process %d died\n", scp->pid)); } return FALSE; } static int signal_vt_rel(scr_stat *scp) { if (scp->smode.mode != VT_PROCESS) return FALSE; scp->status |= SWITCH_WAIT_REL; PROC_LOCK(scp->proc); kern_psignal(scp->proc, scp->smode.relsig); PROC_UNLOCK(scp->proc); DPRINTF(5, ("sending relsig to %d\n", scp->pid)); return TRUE; } static int signal_vt_acq(scr_stat *scp) { if (scp->smode.mode != VT_PROCESS) return FALSE; if (scp->sc->unit == sc_console_unit) cnavailable(sc_consptr, FALSE); scp->status |= SWITCH_WAIT_ACQ; PROC_LOCK(scp->proc); kern_psignal(scp->proc, scp->smode.acqsig); PROC_UNLOCK(scp->proc); DPRINTF(5, ("sending acqsig to %d\n", scp->pid)); return TRUE; } static int finish_vt_rel(scr_stat *scp, int release, int *s) { if (scp == scp->sc->old_scp && scp->status & SWITCH_WAIT_REL) { scp->status &= ~SWITCH_WAIT_REL; if (release) *s = do_switch_scr(scp->sc, *s); else scp->sc->switch_in_progress = 0; return 0; } return EINVAL; } static int finish_vt_acq(scr_stat *scp) { if (scp == scp->sc->new_scp && scp->status & SWITCH_WAIT_ACQ) { scp->status &= ~SWITCH_WAIT_ACQ; scp->sc->switch_in_progress = 0; return 0; } return EINVAL; } static void exchange_scr(sc_softc_t *sc) { scr_stat *scp; /* save the current state of video and keyboard */ sc_move_cursor(sc->old_scp, sc->old_scp->xpos, sc->old_scp->ypos); if (!ISGRAPHSC(sc->old_scp)) sc_remove_cursor_image(sc->old_scp); if (sc->old_scp->kbd_mode == K_XLATE) save_kbd_state(sc->old_scp); /* set up the video for the new screen */ scp = sc->cur_scp = sc->new_scp; if (sc->old_scp->mode != scp->mode || ISUNKNOWNSC(sc->old_scp)) set_mode(scp); #ifndef __sparc64__ else 
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize, (void *)sc->adp->va_window, FALSE); #endif scp->status |= MOUSE_HIDDEN; sc_move_cursor(scp, scp->xpos, scp->ypos); if (!ISGRAPHSC(scp)) sc_set_cursor_image(scp); #ifndef SC_NO_PALETTE_LOADING if (ISGRAPHSC(sc->old_scp)) { #ifdef SC_PIXEL_MODE if (sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT) vidd_load_palette(sc->adp, sc->palette2); else #endif vidd_load_palette(sc->adp, sc->palette); } #endif sc_set_border(scp, scp->border); /* set up the keyboard for the new screen */ if (sc->kbd_open_level == 0 && sc->old_scp->kbd_mode != scp->kbd_mode) (void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode); update_kbd_state(scp, scp->status, LOCK_MASK); mark_all(scp); } static void sc_puts(scr_stat *scp, u_char *buf, int len) { #ifdef DEV_SPLASH /* make screensaver happy */ if (!sticky_splash && scp == scp->sc->cur_scp && !sc_saver_keyb_only) run_scrn_saver = FALSE; #endif if (scp->tsw) (*scp->tsw->te_puts)(scp, buf, len); if (scp->sc->delayed_next_scr) sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1); } void sc_draw_cursor_image(scr_stat *scp) { /* assert(scp == scp->sc->cur_scp); */ SC_VIDEO_LOCK(scp->sc); (*scp->rndr->draw_cursor)(scp, scp->cursor_pos, scp->curs_attr.flags & CONS_BLINK_CURSOR, TRUE, sc_inside_cutmark(scp, scp->cursor_pos)); scp->cursor_oldpos = scp->cursor_pos; SC_VIDEO_UNLOCK(scp->sc); } void sc_remove_cursor_image(scr_stat *scp) { /* assert(scp == scp->sc->cur_scp); */ SC_VIDEO_LOCK(scp->sc); (*scp->rndr->draw_cursor)(scp, scp->cursor_oldpos, scp->curs_attr.flags & CONS_BLINK_CURSOR, FALSE, sc_inside_cutmark(scp, scp->cursor_oldpos)); SC_VIDEO_UNLOCK(scp->sc); } static void update_cursor_image(scr_stat *scp) { /* assert(scp == scp->sc->cur_scp); */ sc_remove_cursor_image(scp); sc_set_cursor_image(scp); sc_draw_cursor_image(scp); } void sc_set_cursor_image(scr_stat *scp) { scp->curs_attr = scp->base_curs_attr; if (scp->curs_attr.flags & CONS_HIDDEN_CURSOR) { /* hidden cursor is internally represented as zero-height underline */ scp->curs_attr.flags = CONS_CHAR_CURSOR; scp->curs_attr.base = scp->curs_attr.height = 0; } else if (scp->curs_attr.flags & CONS_CHAR_CURSOR) { scp->curs_attr.base = imin(scp->base_curs_attr.base, scp->font_size - 1); scp->curs_attr.height = imin(scp->base_curs_attr.height, scp->font_size - scp->curs_attr.base); } else { /* block cursor */ scp->curs_attr.base = 0; scp->curs_attr.height = scp->font_size; } /* assert(scp == scp->sc->cur_scp); */ SC_VIDEO_LOCK(scp->sc); (*scp->rndr->set_cursor)(scp, scp->curs_attr.base, scp->curs_attr.height, scp->curs_attr.flags & CONS_BLINK_CURSOR); SC_VIDEO_UNLOCK(scp->sc); } static void sc_adjust_ca(struct cursor_attr *cap, int flags, int base, int height) { if (flags & CONS_CHARCURSOR_COLORS) { cap->bg[0] = base & 0xff; cap->bg[1] = height & 0xff; } else if (flags & CONS_MOUSECURSOR_COLORS) { cap->mouse_ba = base & 0xff; cap->mouse_ia = height & 0xff; } else { if (base >= 0) cap->base = base; if (height >= 0) cap->height = height; if (!(flags & CONS_SHAPEONLY_CURSOR)) cap->flags = flags & CONS_CURSOR_ATTRS; } } static void change_cursor_shape(scr_stat *scp, int flags, int base, int height) { if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp)) sc_remove_cursor_image(scp); if (flags & CONS_RESET_CURSOR) scp->base_curs_attr = scp->dflt_curs_attr; else if (flags & CONS_DEFAULT_CURSOR) { sc_adjust_ca(&scp->dflt_curs_attr, flags, base, height); scp->base_curs_attr = scp->dflt_curs_attr; } else sc_adjust_ca(&scp->base_curs_attr, flags, base, height); 
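    /*
     * The redraw below happens only when this vty is the one currently
     * displayed and is in text mode; background vtys simply pick up the
     * new shape the next time they are shown.
     */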
if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp)) { sc_set_cursor_image(scp); sc_draw_cursor_image(scp); } } void sc_change_cursor_shape(scr_stat *scp, int flags, int base, int height) { sc_softc_t *sc; struct tty *tp; int s; int i; if (flags == -1) flags = CONS_SHAPEONLY_CURSOR; s = spltty(); if (flags & CONS_LOCAL_CURSOR) { /* local (per vty) change */ change_cursor_shape(scp, flags, base, height); splx(s); return; } /* global change */ sc = scp->sc; if (flags & CONS_RESET_CURSOR) sc->curs_attr = sc->dflt_curs_attr; else if (flags & CONS_DEFAULT_CURSOR) { sc_adjust_ca(&sc->dflt_curs_attr, flags, base, height); sc->curs_attr = sc->dflt_curs_attr; } else sc_adjust_ca(&sc->curs_attr, flags, base, height); for (i = sc->first_vty; i < sc->first_vty + sc->vtys; ++i) { if ((tp = SC_DEV(sc, i)) == NULL) continue; if ((scp = sc_get_stat(tp)) == NULL) continue; scp->dflt_curs_attr = sc->curs_attr; change_cursor_shape(scp, CONS_RESET_CURSOR, -1, -1); } splx(s); } static void scinit(int unit, int flags) { /* * When syscons is being initialized as the kernel console, malloc() * is not yet functional, because various kernel structures have not been * fully initialized yet. Therefore, we need to declare the following * static buffers for the console. This is less than ideal, * but is a necessary evil for the time being. XXX */ static u_short sc_buffer[ROW*COL]; /* XXX */ #ifndef SC_NO_FONT_LOADING static u_char font_8[256*8]; static u_char font_14[256*14]; static u_char font_16[256*16]; #endif sc_softc_t *sc; scr_stat *scp; video_adapter_t *adp; int col; int row; int i; /* one time initialization */ if (init_done == COLD) { sc_get_bios_values(&bios_value); for (i = 0; i < nitems(sc_kattrtab); i++) { #if SC_KERNEL_CONS_ATTR == FG_WHITE sc_kattrtab[i] = 8 + (i + FG_WHITE) % 8U; #else sc_kattrtab[i] = SC_KERNEL_CONS_ATTR; #endif } } init_done = WARM; /* * Allocate resources. Even if we are being called for the second * time, we must allocate them again, because they might have * disappeared... 
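 * (scinit() runs very early when syscons is the kernel console and is
 * called again at sc_attach_unit() time, so the adapter and keyboard
 * are released and re-allocated below rather than assumed to be valid.)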
*/ sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE); if ((sc->flags & SC_INIT_DONE) == 0) SC_VIDEO_LOCKINIT(sc); adp = NULL; if (sc->adapter >= 0) { vid_release(sc->adp, (void *)&sc->adapter); adp = sc->adp; sc->adp = NULL; } if (sc->keyboard >= 0) { DPRINTF(5, ("sc%d: releasing kbd%d\n", unit, sc->keyboard)); i = kbd_release(sc->kbd, (void *)&sc->keyboard); DPRINTF(5, ("sc%d: kbd_release returned %d\n", unit, i)); if (sc->kbd != NULL) { DPRINTF(5, ("sc%d: kbd != NULL!, index:%d, unit:%d, flags:0x%x\n", unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags)); } sc->kbd = NULL; } sc->adapter = vid_allocate("*", unit, (void *)&sc->adapter); sc->adp = vid_get_adapter(sc->adapter); /* assert((sc->adapter >= 0) && (sc->adp != NULL)) */ sc->keyboard = sc_allocate_keyboard(sc, unit); DPRINTF(1, ("sc%d: keyboard %d\n", unit, sc->keyboard)); sc->kbd = kbd_get_keyboard(sc->keyboard); if (sc->kbd != NULL) { DPRINTF(1, ("sc%d: kbd index:%d, unit:%d, flags:0x%x\n", unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags)); } if (!(sc->flags & SC_INIT_DONE) || (adp != sc->adp)) { sc->initial_mode = sc->adp->va_initial_mode; #ifndef SC_NO_FONT_LOADING if (flags & SC_KERNEL_CONSOLE) { sc->font_8 = font_8; sc->font_14 = font_14; sc->font_16 = font_16; } else if (sc->font_8 == NULL) { /* assert(sc_malloc) */ sc->font_8 = malloc(sizeof(font_8), M_DEVBUF, M_WAITOK); sc->font_14 = malloc(sizeof(font_14), M_DEVBUF, M_WAITOK); sc->font_16 = malloc(sizeof(font_16), M_DEVBUF, M_WAITOK); } #endif /* extract the hardware cursor location and hide the cursor for now */ vidd_read_hw_cursor(sc->adp, &col, &row); vidd_set_hw_cursor(sc->adp, -1, -1); /* set up the first console */ sc->first_vty = unit*MAXCONS; sc->vtys = MAXCONS; /* XXX: should be configurable */ if (flags & SC_KERNEL_CONSOLE) { /* * Set up devs structure but don't use it yet, calling make_dev() * might panic kernel. Wait for sc_attach_unit() to actually * create the devices. */ sc->dev = main_devs; scp = &main_console; init_scp(sc, sc->first_vty, scp); sc_vtb_init(&scp->vtb, VTB_MEMORY, scp->xsize, scp->ysize, (void *)sc_buffer, FALSE); if (sc_init_emulator(scp, SC_DFLT_TERM)) sc_init_emulator(scp, "*"); (*scp->tsw->te_default_attr)(scp, SC_KERNEL_CONS_ATTR, SC_KERNEL_CONS_REV_ATTR); } else { /* assert(sc_malloc) */ sc->dev = malloc(sizeof(struct tty *)*sc->vtys, M_DEVBUF, M_WAITOK|M_ZERO); sc->dev[0] = sc_alloc_tty(0, unit * MAXCONS); scp = alloc_scp(sc, sc->first_vty); SC_STAT(sc->dev[0]) = scp; } sc->cur_scp = scp; #ifndef __sparc64__ /* copy screen to temporary buffer */ sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize, (void *)scp->sc->adp->va_window, FALSE); if (ISTEXTSC(scp)) sc_vtb_copy(&scp->scr, 0, &scp->vtb, 0, scp->xsize*scp->ysize); #endif /* Sync h/w cursor position to s/w (sc and teken). 
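 * The BIOS or loader may have left the hardware cursor anywhere, so
 * clamp (col, row) to the screen size before seeding the software
 * cursor and the emulator state.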
*/ if (col >= scp->xsize) col = 0; if (row >= scp->ysize) row = scp->ysize - 1; scp->xpos = col; scp->ypos = row; scp->cursor_pos = scp->cursor_oldpos = row*scp->xsize + col; (*scp->tsw->te_sync)(scp); sc->dflt_curs_attr.base = 0; sc->dflt_curs_attr.height = howmany(scp->font_size, 8); sc->dflt_curs_attr.flags = 0; sc->dflt_curs_attr.bg[0] = FG_RED; sc->dflt_curs_attr.bg[1] = FG_LIGHTGREY; sc->dflt_curs_attr.bg[2] = FG_BLUE; sc->dflt_curs_attr.mouse_ba = FG_WHITE; sc->dflt_curs_attr.mouse_ia = FG_RED; sc->curs_attr = sc->dflt_curs_attr; scp->base_curs_attr = scp->dflt_curs_attr = sc->curs_attr; scp->curs_attr = scp->base_curs_attr; #ifndef SC_NO_SYSMOUSE sc_mouse_move(scp, scp->xpixel/2, scp->ypixel/2); #endif if (!ISGRAPHSC(scp)) { sc_set_cursor_image(scp); sc_draw_cursor_image(scp); } /* save font and palette */ #ifndef SC_NO_FONT_LOADING sc->fonts_loaded = 0; if (ISFONTAVAIL(sc->adp->va_flags)) { #ifdef SC_DFLT_FONT bcopy(dflt_font_8, sc->font_8, sizeof(dflt_font_8)); bcopy(dflt_font_14, sc->font_14, sizeof(dflt_font_14)); bcopy(dflt_font_16, sc->font_16, sizeof(dflt_font_16)); sc->fonts_loaded = FONT_16 | FONT_14 | FONT_8; if (scp->font_size < 14) { sc_load_font(scp, 0, 8, 8, sc->font_8, 0, 256); } else if (scp->font_size >= 16) { sc_load_font(scp, 0, 16, 8, sc->font_16, 0, 256); } else { sc_load_font(scp, 0, 14, 8, sc->font_14, 0, 256); } #else /* !SC_DFLT_FONT */ if (scp->font_size < 14) { sc_save_font(scp, 0, 8, 8, sc->font_8, 0, 256); sc->fonts_loaded = FONT_8; } else if (scp->font_size >= 16) { sc_save_font(scp, 0, 16, 8, sc->font_16, 0, 256); sc->fonts_loaded = FONT_16; } else { sc_save_font(scp, 0, 14, 8, sc->font_14, 0, 256); sc->fonts_loaded = FONT_14; } #endif /* SC_DFLT_FONT */ /* FONT KLUDGE: always use the font page #0. XXX */ sc_show_font(scp, 0); } #endif /* !SC_NO_FONT_LOADING */ #ifndef SC_NO_PALETTE_LOADING vidd_save_palette(sc->adp, sc->palette); #ifdef SC_PIXEL_MODE for (i = 0; i < sizeof(sc->palette2); i++) sc->palette2[i] = i / 3; #endif #endif #ifdef DEV_SPLASH if (!(sc->flags & SC_SPLASH_SCRN)) { /* we are ready to put up the splash image! 
*/ splash_init(sc->adp, scsplash_callback, sc); sc->flags |= SC_SPLASH_SCRN; } #endif } /* the rest is not necessary, if we have done it once */ if (sc->flags & SC_INIT_DONE) return; /* initialize mapscrn arrays to a one to one map */ for (i = 0; i < sizeof(sc->scr_map); i++) sc->scr_map[i] = sc->scr_rmap[i] = i; sc->flags |= SC_INIT_DONE; } static void scterm(int unit, int flags) { sc_softc_t *sc; scr_stat *scp; sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE); if (sc == NULL) return; /* shouldn't happen */ #ifdef DEV_SPLASH /* this console is no longer available for the splash screen */ if (sc->flags & SC_SPLASH_SCRN) { splash_term(sc->adp); sc->flags &= ~SC_SPLASH_SCRN; } #endif #if 0 /* XXX */ /* move the hardware cursor to the upper-left corner */ vidd_set_hw_cursor(sc->adp, 0, 0); #endif /* release the keyboard and the video card */ if (sc->keyboard >= 0) kbd_release(sc->kbd, &sc->keyboard); if (sc->adapter >= 0) vid_release(sc->adp, &sc->adapter); /* stop the terminal emulator, if any */ scp = sc_get_stat(sc->dev[0]); if (scp->tsw) (*scp->tsw->te_term)(scp, &scp->ts); mtx_destroy(&sc->video_mtx); /* clear the structure */ if (!(flags & SC_KERNEL_CONSOLE)) { free(scp->ts, M_DEVBUF); /* XXX: We need delete_dev() for this */ free(sc->dev, M_DEVBUF); #if 0 /* XXX: We need a ttyunregister for this */ free(sc->tty, M_DEVBUF); #endif #ifndef SC_NO_FONT_LOADING free(sc->font_8, M_DEVBUF); free(sc->font_14, M_DEVBUF); free(sc->font_16, M_DEVBUF); #endif /* XXX vtb, history */ } bzero(sc, sizeof(*sc)); sc->keyboard = -1; sc->adapter = -1; } static void scshutdown(__unused void *arg, __unused int howto) { KASSERT(sc_console != NULL, ("sc_console != NULL")); KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL")); KASSERT(sc_console->sc->cur_scp != NULL, ("sc_console->sc->cur_scp != NULL")); sc_touch_scrn_saver(); if (!cold && sc_console->sc->cur_scp->index != sc_console->index && sc_console->sc->cur_scp->smode.mode == VT_AUTO && sc_console->smode.mode == VT_AUTO) sc_switch_scr(sc_console->sc, sc_console->index); shutdown_in_progress = TRUE; } static void scsuspend(__unused void *arg) { int retry; KASSERT(sc_console != NULL, ("sc_console != NULL")); KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL")); KASSERT(sc_console->sc->cur_scp != NULL, ("sc_console->sc->cur_scp != NULL")); sc_susp_scr = sc_console->sc->cur_scp->index; if (sc_no_suspend_vtswitch || sc_susp_scr == sc_console->index) { sc_touch_scrn_saver(); sc_susp_scr = -1; return; } for (retry = 0; retry < 10; retry++) { sc_switch_scr(sc_console->sc, sc_console->index); if (!sc_console->sc->switch_in_progress) break; pause("scsuspend", hz); } suspend_in_progress = TRUE; } static void scresume(__unused void *arg) { KASSERT(sc_console != NULL, ("sc_console != NULL")); KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL")); KASSERT(sc_console->sc->cur_scp != NULL, ("sc_console->sc->cur_scp != NULL")); suspend_in_progress = FALSE; if (sc_susp_scr < 0) { update_font(sc_console->sc->cur_scp); return; } sc_switch_scr(sc_console->sc, sc_susp_scr); } int sc_clean_up(scr_stat *scp) { #ifdef DEV_SPLASH int error; #endif if (scp->sc->flags & SC_SCRN_BLANKED) { sc_touch_scrn_saver(); #ifdef DEV_SPLASH if ((error = wait_scrn_saver_stop(scp->sc))) return error; #endif } scp->status |= MOUSE_HIDDEN; sc_remove_mouse_image(scp); sc_remove_cutmarking(scp); return 0; } void sc_alloc_scr_buffer(scr_stat *scp, int wait, int discard) { sc_vtb_t new; sc_vtb_t old; old = scp->vtb; sc_vtb_init(&new, VTB_MEMORY, scp->xsize, scp->ysize, NULL, 
wait); if (!discard && (old.vtb_flags & VTB_VALID)) { /* retain the current cursor position and buffer contents */ scp->cursor_oldpos = scp->cursor_pos; /* * This works only if the old buffer has the same size as or larger * than the new one. XXX */ sc_vtb_copy(&old, 0, &new, 0, scp->xsize*scp->ysize); scp->vtb = new; } else { scp->vtb = new; sc_vtb_destroy(&old); } #ifndef SC_NO_SYSMOUSE /* move the mouse cursor to the center of the screen */ sc_mouse_move(scp, scp->xpixel / 2, scp->ypixel / 2); #endif } static scr_stat *alloc_scp(sc_softc_t *sc, int vty) { scr_stat *scp; /* assert(sc_malloc) */ scp = (scr_stat *)malloc(sizeof(scr_stat), M_DEVBUF, M_WAITOK); init_scp(sc, vty, scp); sc_alloc_scr_buffer(scp, TRUE, TRUE); if (sc_init_emulator(scp, SC_DFLT_TERM)) sc_init_emulator(scp, "*"); #ifndef SC_NO_CUTPASTE sc_alloc_cut_buffer(scp, TRUE); #endif #ifndef SC_NO_HISTORY sc_alloc_history_buffer(scp, 0, 0, TRUE); #endif return scp; } static void init_scp(sc_softc_t *sc, int vty, scr_stat *scp) { video_info_t info; bzero(scp, sizeof(*scp)); scp->index = vty; scp->sc = sc; scp->status = 0; scp->mode = sc->initial_mode; vidd_get_info(sc->adp, scp->mode, &info); if (info.vi_flags & V_INFO_GRAPHICS) { scp->status |= GRAPHICS_MODE; scp->xpixel = info.vi_width; scp->ypixel = info.vi_height; scp->xsize = info.vi_width/info.vi_cwidth; scp->ysize = info.vi_height/info.vi_cheight; scp->font_size = 0; scp->font = NULL; } else { scp->xsize = info.vi_width; scp->ysize = info.vi_height; scp->xpixel = scp->xsize*info.vi_cwidth; scp->ypixel = scp->ysize*info.vi_cheight; } scp->font_size = info.vi_cheight; scp->font_width = info.vi_cwidth; #ifndef SC_NO_FONT_LOADING if (info.vi_cheight < 14) scp->font = sc->font_8; else if (info.vi_cheight >= 16) scp->font = sc->font_16; else scp->font = sc->font_14; #else scp->font = NULL; #endif sc_vtb_init(&scp->vtb, VTB_MEMORY, 0, 0, NULL, FALSE); #ifndef __sparc64__ sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, 0, 0, NULL, FALSE); #endif scp->xoff = scp->yoff = 0; scp->xpos = scp->ypos = 0; scp->start = scp->xsize * scp->ysize - 1; scp->end = 0; scp->tsw = NULL; scp->ts = NULL; scp->rndr = NULL; scp->border = (SC_NORM_ATTR >> 4) & 0x0f; scp->base_curs_attr = scp->dflt_curs_attr = sc->curs_attr; scp->mouse_cut_start = scp->xsize*scp->ysize; scp->mouse_cut_end = -1; scp->mouse_signal = 0; scp->mouse_pid = 0; scp->mouse_proc = NULL; scp->kbd_mode = K_XLATE; scp->bell_pitch = bios_value.bell_pitch; scp->bell_duration = BELL_DURATION; scp->status |= (bios_value.shift_state & NLKED); scp->status |= CURSOR_ENABLED | MOUSE_HIDDEN; scp->pid = 0; scp->proc = NULL; scp->smode.mode = VT_AUTO; scp->history = NULL; scp->history_pos = 0; scp->history_size = 0; } int sc_init_emulator(scr_stat *scp, char *name) { sc_term_sw_t *sw; sc_rndr_sw_t *rndr; void *p; int error; if (name == NULL) /* if no name is given, use the current emulator */ sw = scp->tsw; else /* ...otherwise find the named emulator */ sw = sc_term_match(name); if (sw == NULL) return EINVAL; rndr = NULL; if (strcmp(sw->te_renderer, "*") != 0) { rndr = sc_render_match(scp, sw->te_renderer, scp->status & (GRAPHICS_MODE | PIXEL_MODE)); } if (rndr == NULL) { rndr = sc_render_match(scp, scp->sc->adp->va_name, scp->status & (GRAPHICS_MODE | PIXEL_MODE)); if (rndr == NULL) return ENODEV; } if (sw == scp->tsw) { error = (*sw->te_init)(scp, &scp->ts, SC_TE_WARM_INIT); scp->rndr = rndr; scp->rndr->init(scp); sc_clear_screen(scp); /* assert(error == 0); */ return error; } if (sc_malloc && (sw->te_size > 0)) p = malloc(sw->te_size, 
M_DEVBUF, M_NOWAIT); else p = NULL; error = (*sw->te_init)(scp, &p, SC_TE_COLD_INIT); if (error) return error; if (scp->tsw) (*scp->tsw->te_term)(scp, &scp->ts); if (scp->ts != NULL) free(scp->ts, M_DEVBUF); scp->tsw = sw; scp->ts = p; scp->rndr = rndr; scp->rndr->init(scp); (*sw->te_default_attr)(scp, SC_NORM_ATTR, SC_NORM_REV_ATTR); sc_clear_screen(scp); return 0; } /* * scgetc(flags) - get character from keyboard. * If flags & SCGETC_CN, then avoid harmful side effects. * If flags & SCGETC_NONBLOCK, then return NOKEY if there is nothing there, * else wait until a key is pressed. */ static u_int scgetc(sc_softc_t *sc, u_int flags, struct sc_cnstate *sp) { scr_stat *scp; #ifndef SC_NO_HISTORY struct tty *tp; #endif u_int c; int this_scr; int f; int i; if (sc->kbd == NULL) return NOKEY; next_code: #if 1 /* I don't like this, but... XXX */ if (flags & SCGETC_CN) sccnupdate(sc->cur_scp); #endif scp = sc->cur_scp; /* first see if there is something in the keyboard port */ for (;;) { if (flags & SCGETC_CN) sccnscrunlock(sc, sp); c = kbdd_read_char(sc->kbd, !(flags & SCGETC_NONBLOCK)); if (flags & SCGETC_CN) sccnscrlock(sc, sp); if (c == ERRKEY) { if (!(flags & SCGETC_CN)) sc_bell(scp, bios_value.bell_pitch, BELL_DURATION); } else if (c == NOKEY) return c; else break; } /* make screensaver happy */ if (!(c & RELKEY)) sc_touch_scrn_saver(); if (!(flags & SCGETC_CN)) random_harvest_queue(&c, sizeof(c), 1, RANDOM_KEYBOARD); if (sc->kbd_open_level == 0 && scp->kbd_mode != K_XLATE) return KEYCHAR(c); /* if scroll-lock pressed allow history browsing */ if (!ISGRAPHSC(scp) && scp->history && scp->status & SLKED) { scp->status &= ~CURSOR_ENABLED; sc_remove_cursor_image(scp); #ifndef SC_NO_HISTORY if (!(scp->status & BUFFER_SAVED)) { scp->status |= BUFFER_SAVED; sc_hist_save(scp); } switch (c) { /* FIXME: key codes */ case SPCLKEY | FKEY | F(49): /* home key */ sc_remove_cutmarking(scp); sc_hist_home(scp); goto next_code; case SPCLKEY | FKEY | F(57): /* end key */ sc_remove_cutmarking(scp); sc_hist_end(scp); goto next_code; case SPCLKEY | FKEY | F(50): /* up arrow key */ sc_remove_cutmarking(scp); if (sc_hist_up_line(scp)) if (!(flags & SCGETC_CN)) sc_bell(scp, bios_value.bell_pitch, BELL_DURATION); goto next_code; case SPCLKEY | FKEY | F(58): /* down arrow key */ sc_remove_cutmarking(scp); if (sc_hist_down_line(scp)) if (!(flags & SCGETC_CN)) sc_bell(scp, bios_value.bell_pitch, BELL_DURATION); goto next_code; case SPCLKEY | FKEY | F(51): /* page up key */ sc_remove_cutmarking(scp); for (i = 0; i < scp->ysize; i++) if (sc_hist_up_line(scp)) { if (!(flags & SCGETC_CN)) sc_bell(scp, bios_value.bell_pitch, BELL_DURATION); break; } goto next_code; case SPCLKEY | FKEY | F(59): /* page down key */ sc_remove_cutmarking(scp); for (i = 0; i < scp->ysize; i++) if (sc_hist_down_line(scp)) { if (!(flags & SCGETC_CN)) sc_bell(scp, bios_value.bell_pitch, BELL_DURATION); break; } goto next_code; } #endif /* SC_NO_HISTORY */ } /* * Process and consume special keys here. Return a plain char code * or a char code with the META flag or a function key code. 
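 *
 * A rough map of the key codes handled below, assuming the usual
 * definitions from <sys/kbio.h>:
 *
 *	c & RELKEY	release event (consumed, loop for the next code)
 *	c & SPCLKEY	special key; KEYCHAR(c) selects a case below
 *	otherwise	a plain (possibly META-flagged) character that
 *			is returned to the caller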
*/ if (c & RELKEY) { /* key released */ /* goto next_code */ } else { /* key pressed */ if (c & SPCLKEY) { c &= ~SPCLKEY; switch (KEYCHAR(c)) { /* LOCKING KEYS */ case NLK: case CLK: case ALK: break; case SLK: (void)kbdd_ioctl(sc->kbd, KDGKBSTATE, (caddr_t)&f); if (f & SLKED) { scp->status |= SLKED; } else { if (scp->status & SLKED) { scp->status &= ~SLKED; #ifndef SC_NO_HISTORY if (scp->status & BUFFER_SAVED) { if (!sc_hist_restore(scp)) sc_remove_cutmarking(scp); scp->status &= ~BUFFER_SAVED; scp->status |= CURSOR_ENABLED; sc_draw_cursor_image(scp); } /* Only safe in Giant-locked context. */ tp = SC_DEV(sc, scp->index); if (!(flags & SCGETC_CN) && tty_opened_ns(tp)) sctty_outwakeup(tp); #endif } } break; case PASTE: #ifndef SC_NO_CUTPASTE sc_mouse_paste(scp); #endif break; /* NON-LOCKING KEYS */ case NOP: case LSH: case RSH: case LCTR: case RCTR: case LALT: case RALT: case ASH: case META: break; case BTAB: if (!(sc->flags & SC_SCRN_BLANKED)) return c; break; case SPSC: #ifdef DEV_SPLASH /* force activation/deactivation of the screen saver */ if (!(sc->flags & SC_SCRN_BLANKED)) { run_scrn_saver = TRUE; sc->scrn_time_stamp -= scrn_blank_time; } if (cold) { /* * While devices are being probed, the screen saver needs * to be invoked explicitly. XXX */ if (sc->flags & SC_SCRN_BLANKED) { scsplash_stick(FALSE); stop_scrn_saver(sc, current_saver); } else { if (!ISGRAPHSC(scp)) { scsplash_stick(TRUE); (*current_saver)(sc, TRUE); } } } #endif /* DEV_SPLASH */ break; case RBT: #ifndef SC_DISABLE_REBOOT if (enable_reboot && !(flags & SCGETC_CN)) shutdown_nice(0); #endif break; case HALT: #ifndef SC_DISABLE_REBOOT if (enable_reboot && !(flags & SCGETC_CN)) shutdown_nice(RB_HALT); #endif break; case PDWN: #ifndef SC_DISABLE_REBOOT if (enable_reboot && !(flags & SCGETC_CN)) shutdown_nice(RB_HALT|RB_POWEROFF); #endif break; case SUSP: power_pm_suspend(POWER_SLEEP_STATE_SUSPEND); break; case STBY: power_pm_suspend(POWER_SLEEP_STATE_STANDBY); break; case DBG: #ifndef SC_DISABLE_KDBKEY if (enable_kdbkey) kdb_break(); #endif break; case PNC: if (enable_panic_key) panic("Forced by the panic key"); break; case NEXT: this_scr = scp->index; for (i = (this_scr - sc->first_vty + 1)%sc->vtys; sc->first_vty + i != this_scr; i = (i + 1)%sc->vtys) { struct tty *tp = SC_DEV(sc, sc->first_vty + i); if (tty_opened_ns(tp)) { sc_switch_scr(scp->sc, sc->first_vty + i); break; } } break; case PREV: this_scr = scp->index; for (i = (this_scr - sc->first_vty + sc->vtys - 1)%sc->vtys; sc->first_vty + i != this_scr; i = (i + sc->vtys - 1)%sc->vtys) { struct tty *tp = SC_DEV(sc, sc->first_vty + i); if (tty_opened_ns(tp)) { sc_switch_scr(scp->sc, sc->first_vty + i); break; } } break; default: if (KEYCHAR(c) >= F_SCR && KEYCHAR(c) <= L_SCR) { sc_switch_scr(scp->sc, sc->first_vty + KEYCHAR(c) - F_SCR); break; } /* assert(c & FKEY) */ if (!(sc->flags & SC_SCRN_BLANKED)) return c; break; } /* goto next_code */ } else { /* regular keys (maybe MKEY is set) */ #if !defined(SC_DISABLE_KDBKEY) && defined(KDB) if (enable_kdbkey) kdb_alt_break(c, &sc->sc_altbrk); #endif if (!(sc->flags & SC_SCRN_BLANKED)) return c; } } goto next_code; } static int sctty_mmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { scr_stat *scp; scp = sc_get_stat(tp); if (scp != scp->sc->cur_scp) return -1; return vidd_mmap(scp->sc->adp, offset, paddr, nprot, memattr); } static void update_font(scr_stat *scp) { #ifndef SC_NO_FONT_LOADING /* load appropriate font */ if (!(scp->status & GRAPHICS_MODE)) { if (!(scp->status & 
PIXEL_MODE) && ISFONTAVAIL(scp->sc->adp->va_flags)) { if (scp->font_size < 14) { if (scp->sc->fonts_loaded & FONT_8) sc_load_font(scp, 0, 8, 8, scp->sc->font_8, 0, 256); } else if (scp->font_size >= 16) { if (scp->sc->fonts_loaded & FONT_16) sc_load_font(scp, 0, 16, 8, scp->sc->font_16, 0, 256); } else { if (scp->sc->fonts_loaded & FONT_14) sc_load_font(scp, 0, 14, 8, scp->sc->font_14, 0, 256); } /* * FONT KLUDGE: * This is an interim kludge to display correct font. * Always use the font page #0 on the video plane 2. * Somehow we cannot show the font in other font pages on * some video cards... XXX */ sc_show_font(scp, 0); } mark_all(scp); } #endif /* !SC_NO_FONT_LOADING */ } static int save_kbd_state(scr_stat *scp) { int state; int error; error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state); if (error == ENOIOCTL) error = ENODEV; if (error == 0) { scp->status &= ~LOCK_MASK; scp->status |= state; } return error; } static int update_kbd_state(scr_stat *scp, int new_bits, int mask) { int state; int error; if (mask != LOCK_MASK) { error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state); if (error == ENOIOCTL) error = ENODEV; if (error) return error; state &= ~mask; state |= new_bits & mask; } else { state = new_bits & LOCK_MASK; } error = kbdd_ioctl(scp->sc->kbd, KDSKBSTATE, (caddr_t)&state); if (error == ENOIOCTL) error = ENODEV; return error; } static int update_kbd_leds(scr_stat *scp, int which) { int error; which &= LOCK_MASK; error = kbdd_ioctl(scp->sc->kbd, KDSETLED, (caddr_t)&which); if (error == ENOIOCTL) error = ENODEV; return error; } int set_mode(scr_stat *scp) { video_info_t info; /* reject unsupported mode */ if (vidd_get_info(scp->sc->adp, scp->mode, &info)) return 1; /* if this vty is not currently showing, do nothing */ if (scp != scp->sc->cur_scp) return 0; /* setup video hardware for the given mode */ vidd_set_mode(scp->sc->adp, scp->mode); scp->rndr->init(scp); #ifndef __sparc64__ sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize, (void *)scp->sc->adp->va_window, FALSE); #endif update_font(scp); sc_set_border(scp, scp->border); sc_set_cursor_image(scp); return 0; } void sc_set_border(scr_stat *scp, int color) { SC_VIDEO_LOCK(scp->sc); (*scp->rndr->draw_border)(scp, color); SC_VIDEO_UNLOCK(scp->sc); } #ifndef SC_NO_FONT_LOADING void sc_load_font(scr_stat *scp, int page, int size, int width, u_char *buf, int base, int count) { sc_softc_t *sc; sc = scp->sc; sc->font_loading_in_progress = TRUE; vidd_load_font(sc->adp, page, size, width, buf, base, count); sc->font_loading_in_progress = FALSE; } void sc_save_font(scr_stat *scp, int page, int size, int width, u_char *buf, int base, int count) { sc_softc_t *sc; sc = scp->sc; sc->font_loading_in_progress = TRUE; vidd_save_font(sc->adp, page, size, width, buf, base, count); sc->font_loading_in_progress = FALSE; } void sc_show_font(scr_stat *scp, int page) { vidd_show_font(scp->sc->adp, page); } #endif /* !SC_NO_FONT_LOADING */ void sc_paste(scr_stat *scp, const u_char *p, int count) { struct tty *tp; u_char *rmap; tp = SC_DEV(scp->sc, scp->sc->cur_scp->index); if (!tty_opened_ns(tp)) return; rmap = scp->sc->scr_rmap; for (; count > 0; --count) ttydisc_rint(tp, rmap[*p++], 0); ttydisc_rint_done(tp); } void sc_respond(scr_stat *scp, const u_char *p, int count, int wakeup) { struct tty *tp; tp = SC_DEV(scp->sc, scp->sc->cur_scp->index); if (!tty_opened_ns(tp)) return; ttydisc_rint_simple(tp, p, count); if (wakeup) { /* XXX: we can't always call ttydisc_rint_done() here! 
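 * (presumably because sc_respond() can be reached from console or kdb
 * paths where completing a tty input batch is unsafe; callers signal
 * the safe case with the wakeup argument.)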
*/ ttydisc_rint_done(tp); } } void sc_bell(scr_stat *scp, int pitch, int duration) { if (cold || kdb_active || shutdown_in_progress || !enable_bell) return; if (scp != scp->sc->cur_scp && (scp->sc->flags & SC_QUIET_BELL)) return; if (scp->sc->flags & SC_VISUAL_BELL) { if (scp->sc->blink_in_progress) return; scp->sc->blink_in_progress = 3; if (scp != scp->sc->cur_scp) scp->sc->blink_in_progress += 2; blink_screen(scp->sc->cur_scp); } else if (duration != 0 && pitch != 0) { if (scp != scp->sc->cur_scp) pitch *= 2; sysbeep(1193182 / pitch, duration); } } static int sc_kattr(void) { if (sc_console == NULL) return (SC_KERNEL_CONS_ATTR); /* for very early, before pcpu */ return (sc_kattrtab[PCPU_GET(cpuid) % nitems(sc_kattrtab)]); } static void blink_screen(void *arg) { scr_stat *scp = arg; struct tty *tp; if (ISGRAPHSC(scp) || (scp->sc->blink_in_progress <= 1)) { scp->sc->blink_in_progress = 0; mark_all(scp); tp = SC_DEV(scp->sc, scp->index); if (tty_opened_ns(tp)) sctty_outwakeup(tp); if (scp->sc->delayed_next_scr) sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1); } else { (*scp->rndr->draw)(scp, 0, scp->xsize*scp->ysize, scp->sc->blink_in_progress & 1); scp->sc->blink_in_progress--; callout_reset_sbt(&scp->sc->cblink, SBT_1S / 15, 0, blink_screen, scp, C_PREL(0)); } } /* * Until sc_attach_unit() gets called no dev structures will be available * to store the per-screen current status. This is the case when the * kernel is initially booting and needs access to its console. During * this early phase of booting the console's current status is kept in * one statically defined scr_stat structure, and any pointers to the * dev structures will be NULL. */ static scr_stat * sc_get_stat(struct tty *tp) { if (tp == NULL) return (&main_console); return (SC_STAT(tp)); } /* * Allocate active keyboard. Try to allocate "kbdmux" keyboard first, and, * if found, add all non-busy keyboards to "kbdmux". Otherwise look for * any keyboard. */ static int sc_allocate_keyboard(sc_softc_t *sc, int unit) { int idx0, idx; keyboard_t *k0, *k; keyboard_info_t ki; idx0 = kbd_allocate("kbdmux", -1, (void *)&sc->keyboard, sckbdevent, sc); if (idx0 != -1) { k0 = kbd_get_keyboard(idx0); for (idx = kbd_find_keyboard2("*", -1, 0); idx != -1; idx = kbd_find_keyboard2("*", -1, idx + 1)) { k = kbd_get_keyboard(idx); if (idx == idx0 || KBD_IS_BUSY(k)) continue; bzero(&ki, sizeof(ki)); strcpy(ki.kb_name, k->kb_name); ki.kb_unit = k->kb_unit; (void)kbdd_ioctl(k0, KBADDKBD, (caddr_t) &ki); } } else idx0 = kbd_allocate("*", unit, (void *)&sc->keyboard, sckbdevent, sc); return (idx0); } Index: head/sys/i386/conf/NOTES =================================================================== --- head/sys/i386/conf/NOTES (revision 332488) +++ head/sys/i386/conf/NOTES (revision 332489) @@ -1,1000 +1,971 @@ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # This file contains machine dependent kernel configuration notes. For # machine independent notes, look in /sys/conf/NOTES. # # $FreeBSD$ # # # We want LINT to cover profiling as well. profile 2 # # Enable the kernel DTrace hooks which are required to load the DTrace # kernel modules. 
# options KDTRACE_HOOKS # DTrace core # NOTE: introduces CDDL-licensed components into the kernel #device dtrace # DTrace modules #device dtrace_profile #device dtrace_sdt #device dtrace_fbt #device dtrace_systrace #device dtrace_prototype #device dtnfscl #device dtmalloc # Alternatively include all the DTrace modules #device dtraceall ##################################################################### # SMP OPTIONS: # # The apic device enables the use of the I/O APIC for interrupt delivery. # The apic device can be used in both UP and SMP kernels, but is required # for SMP kernels. Thus, the apic device is not strictly an SMP option, # but it is a prerequisite for SMP. # # Notes: # # HTT CPUs should only be used if they are enabled in the BIOS. For # the ACPI case, ACPI only correctly tells us about any HTT CPUs if # they are enabled. However, most HTT systems do not list HTT CPUs # in the MP Table if they are enabled, thus we guess at the HTT CPUs # for the MP Table case. However, we shouldn't try to guess and use # these CPUs if HTT is disabled. Thus, HTT guessing is only enabled # for the MP Table if the user explicitly asks for it via the # MPTABLE_FORCE_HTT option. Do NOT use this option if you have HTT # disabled in your BIOS. # # IPI_PREEMPTION instructs the kernel to preempt threads running on other # CPUs if needed. Relies on the PREEMPTION option. # Mandatory: device apic # I/O apic # Optional: options MPTABLE_FORCE_HTT # Enable HTT CPUs with the MP Table options IPI_PREEMPTION # # Watchdog routines. # options MP_WATCHDOG # Debugging options. # options COUNT_XINVLTLB_HITS # Counters for TLB events options COUNT_IPIS # Per-CPU IPI interrupt counters ##################################################################### # CPU OPTIONS # # You must specify at least one CPU (the one you intend to run on); # deleting the specification for CPUs you don't need to use may make # parts of the system run faster. # cpu I486_CPU cpu I586_CPU # aka Pentium(tm) cpu I686_CPU # aka Pentium Pro(tm) # # Options for CPU features. # # CPU_ATHLON_SSE_HACK tries to enable SSE instructions when the BIOS has # forgotten to enable them. # # CPU_BLUELIGHTNING_3X enables triple-clock mode on IBM Blue Lightning # CPU if the CPU supports it. The default is double-clock mode on # BlueLightning CPU box. # # CPU_BLUELIGHTNING_FPU_OP_CACHE enables FPU operand cache on IBM # BlueLightning CPU. It works only with Cyrix FPU, and this option # should not be used with Intel FPU. # # CPU_BTB_EN enables branch target buffer on Cyrix 5x86 (NOTE 1). # # CPU_CYRIX_NO_LOCK enables weak locking for the entire address space # of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1. # Otherwise, the NO_LOCK bit of CCR1 is cleared. (NOTE 3) # # CPU_DIRECT_MAPPED_CACHE sets L1 cache of Cyrix 486DLC CPU in direct # mapped mode. Default is 2-way set associative mode. # # CPU_DISABLE_5X86_LSSER disables load store serialize (i.e., enables # reorder). This option should not be used if you use memory mapped # I/O device(s). # # CPU_ELAN enables support for AMD's ElanSC520 CPU. # CPU_ELAN_PPS enables precision timestamp code. # CPU_ELAN_XTAL sets the clock crystal frequency in Hz. # # CPU_ENABLE_LONGRUN enables support for Transmeta Crusoe LongRun # technology which allows restricting the power consumption of the CPU by # using the group of hw.crusoe.* sysctls. # # CPU_FASTER_5X86_FPU enables faster FPU exception handler. # # CPU_GEODE is for the SC1100 Geode embedded processor. 
This option # is necessary because the i8254 timecounter is toast. # # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products # for i386 machines. # # CPU_IORT defines I/O clock delay time (NOTE 1). Default values of # I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7,respectively # (no clock delay). # # CPU_L2_LATENCY specifies the L2 cache latency value. This option is used # only when CPU_PPRO2CELERON is defined and Mendocino Celeron is detected. # The default value is 5. # # CPU_LOOP_EN prevents flushing the prefetch buffer if the destination # of a jump is already present in the prefetch buffer on Cyrix 5x86(NOTE # 1). # # CPU_PPRO2CELERON enables L2 cache of Mendocino Celeron CPUs. This option # is useful when you use Socket 8 to Socket 370 converter, because most Pentium # Pro BIOSs do not enable L2 cache of Mendocino Celeron CPUs. # # CPU_RSTK_EN enables return stack on Cyrix 5x86 (NOTE 1). # # CPU_SOEKRIS enables support www.soekris.com hardware. # # CPU_SUSP_HLT enables suspend on HALT. If this option is set, CPU # enters suspend mode following execution of HALT instruction. # # CPU_UPGRADE_HW_CACHE eliminates unneeded cache flush instruction(s). # # CPU_WT_ALLOC enables write allocation on Cyrix 6x86/6x86MX and AMD # K5/K6/K6-2 CPUs. # # CYRIX_CACHE_WORKS enables CPU cache on Cyrix 486 CPUs with cache # flush at hold state. # # CYRIX_CACHE_REALLY_WORKS enables (1) CPU cache on Cyrix 486 CPUs # without cache flush at hold state, and (2) write-back CPU cache on # Cyrix 6x86 whose revision < 2.7 (NOTE 2). # # NO_F00F_HACK disables the hack that prevents Pentiums (and ONLY # Pentiums) from locking up when a LOCK CMPXCHG8B instruction is # executed. This option is only needed if I586_CPU is also defined, # and should be included for any non-Pentium CPU that defines it. # # NO_MEMORY_HOLE is an optimisation for systems with AMD K6 processors # which indicates that the 15-16MB range is *definitely* not being # occupied by an ISA memory hole. # # NOTE 1: The options, CPU_BTB_EN, CPU_LOOP_EN, CPU_IORT, # CPU_LOOP_EN and CPU_RSTK_EN should not be used because of CPU bugs. # These options may crash your system. # # NOTE 2: If CYRIX_CACHE_REALLY_WORKS is not set, CPU cache is enabled # in write-through mode when revision < 2.7. If revision of Cyrix # 6x86 >= 2.7, CPU cache is always enabled in write-back mode. # # NOTE 3: This option may cause failures for software that requires # locked cycles in order to operate correctly. # options CPU_ATHLON_SSE_HACK options CPU_BLUELIGHTNING_3X options CPU_BLUELIGHTNING_FPU_OP_CACHE options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER options CPU_ELAN options CPU_ELAN_PPS options CPU_ELAN_XTAL=32768000 options CPU_ENABLE_LONGRUN options CPU_FASTER_5X86_FPU options CPU_GEODE options CPU_I486_ON_386 options CPU_IORT options CPU_L2_LATENCY=5 options CPU_LOOP_EN options CPU_PPRO2CELERON options CPU_RSTK_EN options CPU_SOEKRIS options CPU_SUSP_HLT options CPU_UPGRADE_HW_CACHE options CPU_WT_ALLOC options CYRIX_CACHE_WORKS options CYRIX_CACHE_REALLY_WORKS #options NO_F00F_HACK # Debug options options NPX_DEBUG # enable npx debugging # # PERFMON causes the driver for Pentium/Pentium Pro performance counters # to be compiled. See perfmon(4) for more information. 
# options PERFMON ##################################################################### # NETWORKING OPTIONS # # DEVICE_POLLING adds support for mixed interrupt-polling handling # of network device drivers, which has significant benefits in terms # of robustness to overloads and responsiveness, as well as permitting # accurate scheduling of the CPU time between kernel network processing # and other activities. The drawback is a moderate (up to 1/HZ seconds) # potential increase in response times. # It is strongly recommended to use HZ=1000 or 2000 with DEVICE_POLLING # to achieve smoother behaviour. # Additionally, you can enable/disable polling at runtime with the help of # the ifconfig(8) utility, and select the CPU fraction reserved to # userland with the sysctl variable kern.polling.user_frac # (default 50, range 0..100). # # Not all device drivers support this mode of operation at the time of # this writing. See polling(4) for more details. options DEVICE_POLLING # BPF_JITTER adds support for BPF just-in-time compiler. options BPF_JITTER # OpenFabrics Enterprise Distribution (Infiniband). options OFED options OFED_DEBUG_INIT # Sockets Direct Protocol options SDP options SDP_DEBUG # IP over Infiniband options IPOIB options IPOIB_DEBUG options IPOIB_CM ##################################################################### # CLOCK OPTIONS # Provide read/write access to the memory in the clock chip. device nvram # Access to rtc cmos via /dev/nvram ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS device speaker #Play IBM BASIC-style noises out your speaker hint.speaker.0.at="isa" hint.speaker.0.port="0x61" device gzip #Exec gzipped a.out's. REQUIRES COMPAT_AOUT! device apm_saver # Requires APM ##################################################################### # HARDWARE BUS CONFIGURATION # # ISA bus # device isa # # Options for `isa': # # AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # This option breaks suspend/resume on some portables. # # AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # Automatic EOI is documented not to work for the slave with the # original i8259A, but it works for some clones and some integrated # versions. # # MAXMEM specifies the amount of RAM on the machine; if this is not # specified, FreeBSD will first read the amount of memory from the CMOS # RAM, so the amount of memory will initially be limited to 64MB or 16MB # depending on the BIOS. If the BIOS reports 64MB, a memory probe will # then attempt to detect the installed amount of RAM. If this probe # fails to detect >64MB RAM you will have to use the MAXMEM option. # The amount is in kilobytes, so for a machine with 128MB of RAM, it would # be 131072 (128 * 1024). # # BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to # reset the CPU for reboot. This is needed on some systems with broken # keyboard controllers. options AUTO_EOI_1 #options AUTO_EOI_2 options MAXMEM=(128*1024) #options BROKEN_KEYBOARD_RESET # # AGP GART support device agp # AGP debugging. options AGP_DEBUG ##################################################################### # HARDWARE DEVICE CONFIGURATION # To include support for VGA VESA video modes options VESA # Turn on extra debugging checks and output for VESA support. 
options VESA_DEBUG device dpms # DPMS suspend & resume via VESA BIOS # x86 real mode BIOS emulator, required by atkbdc/dpms/vesa options X86BIOS # # Hints for the non-optional Numeric Processing eXtension driver. hint.npx.0.flags="0x0" hint.npx.0.irq="13" # # `flags' for npx0: # 0x01 don't use the npx registers to optimize bcopy. # 0x02 don't use the npx registers to optimize bzero. # 0x04 don't use the npx registers to optimize copyin or copyout. # The npx registers are normally used to optimize copying and zeroing when # all of the following conditions are satisfied: # I586_CPU is an option # the cpu is an i586 (perhaps not a Pentium) # the probe for npx0 succeeds # INT 16 exception handling works. # Then copying and zeroing using the npx registers is normally 30-100% faster. # The flags can be used to control cases where it doesn't work or is slower. # Setting them at boot time using hints works right (the optimizations # are not used until later in the bootstrap when npx0 is attached). # Flag 0x08 automatically disables the i586 optimized routines. # # # Optional devices: # # PS/2 mouse device psm hint.psm.0.at="atkbdc" hint.psm.0.irq="12" # Options for psm: options PSM_HOOKRESUME #hook the system resume event, useful #for some laptops options PSM_RESETAFTERSUSPEND #reset the device at the resume event # The keyboard controller; it controls the keyboard and the PS/2 mouse. device atkbdc hint.atkbdc.0.at="isa" hint.atkbdc.0.port="0x060" # The AT keyboard device atkbd hint.atkbd.0.at="atkbdc" hint.atkbd.0.irq="1" # Options for atkbd: options ATKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions ATKBD_DFLT_KEYMAP=fr.dvorak # `flags' for atkbd: # 0x01 Force detection of keyboard, else we always assume a keyboard # 0x02 Don't reset keyboard, useful for some newer ThinkPads # 0x03 Force detection and avoid reset, might help with certain # dockingstations # 0x04 Old-style (XT) keyboard support, useful for older ThinkPads # Video card driver for VGA adapters. device vga hint.vga.0.at="isa" # Options for vga: # Try the following option if the mouse pointer is not drawn correctly # or font does not seem to be loaded properly. May cause flicker on # some systems. options VGA_ALT_SEQACCESS # If you can dispense with some vga driver features, you may want to # use the following options to save some memory. #options VGA_NO_FONT_LOADING # don't save/load font #options VGA_NO_MODE_CHANGE # don't change video modes # Older video cards may require this option for proper operation. options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs # The following option probably won't work with the LCD displays. options VGA_WIDTH90 # support 90 column modes # Debugging. options VGA_DEBUG # vt(4) drivers. device vt_vga # Linear framebuffer driver for S3 VESA 1.2 cards. Works on top of VESA. device s3pci # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as # the tdfx DRI module from XFree86 and is completely unrelated. # # To enable Linuxulator support, one must also include COMPAT_LINUX in the # config as well. The other option is to load both as modules. device tdfx # Enable 3Dfx Voodoo support device tdfx_linux # Enable Linuxulator support # # ACPI support using the Intel ACPI Component Architecture reference # implementation. 
# # ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer # kernel environment variables to select initial debugging levels for the # Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER # defined when it is built). device acpi options ACPI_DEBUG options ACPI_DMAR # ACPI WMI Mapping driver device acpi_wmi # ACPI Asus Extras (LCD backlight/brightness, video output, etc.) device acpi_asus # ACPI Fujitsu Extras (Buttons) device acpi_fujitsu # ACPI extras driver for HP laptops device acpi_hp # ACPI extras driver for IBM laptops device acpi_ibm # ACPI Panasonic Extras (LCD backlight/brightness, video output, etc.) device acpi_panasonic # ACPI Sony extra (LCD brightness) device acpi_sony # ACPI Toshiba Extras (LCD backlight/brightness, video output, etc.) device acpi_toshiba # ACPI Video Extensions (LCD backlight/brightness, video output, etc.) device acpi_video # ACPI Docking Station device acpi_dock # ACPI ASOC ATK0110 ASUSTeK AI Booster (voltage, temperature and fan sensors) device aibs # The cpufreq(4) driver provides support for non-ACPI CPU frequency control device cpufreq # Direct Rendering modules for 3D acceleration. device drm # DRM core module required by DRM drivers device mach64drm # ATI Rage Pro, Rage Mobility P/M, Rage XL device mgadrm # AGP Matrox G200, G400, G450, G550 device r128drm # ATI Rage 128 device savagedrm # S3 Savage3D, Savage4 device sisdrm # SiS 300/305, 540, 630 device tdfxdrm # 3dfx Voodoo 3/4/5 and Banshee device viadrm # VIA options DRM_DEBUG # Include debug printfs (slow) # # mse: Logitech and ATI InPort bus mouse ports device mse hint.mse.0.at="isa" hint.mse.0.port="0x23c" hint.mse.0.irq="5" # # Network interfaces: # # bxe: Broadcom NetXtreme II (BCM5771X/BCM578XX) PCIe 10Gb Ethernet # adapters. 
# ce: Cronyx Tau-PCI/32 sync single/dual port G.703/E1 serial adaptor # with 32 HDLC subchannels (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # cp: Cronyx Tau-PCI sync single/dual/four port # V.35/RS-232/RS-530/RS-449/X.21/G.703/E1/E3/T3/STS-1 # serial adaptor (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # cs: IBM Etherjet and other Crystal Semi CS89x0-based adapters # ctau: Cronyx Tau sync dual port V.35/RS-232/RS-530/RS-449/X.21/G.703/E1 # serial adaptor (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503 # HP PC Lan+, various PC Card devices # (requires miibus) # ipw: Intel PRO/Wireless 2100 IEEE 802.11 adapter # iwi: Intel PRO/Wireless 2200BG/2225BG/2915ABG IEEE 802.11 adapters # Requires the iwi firmware module # iwn: Intel Wireless WiFi Link 1000/105/135/2000/4965/5000/6000/6050 abgn # 802.11 network adapters # Requires the iwn firmware module # mthca: Mellanox HCA InfiniBand # mlx4ib: Mellanox ConnectX HCA InfiniBand # mlx4en: Mellanox ConnectX HCA Ethernet # nfe: nVidia nForce MCP on-board Ethernet Networking (BSD open source) # sbni: Granch SBNI12-xx ISA and PCI adapters # vmx: VMware VMXNET3 Ethernet (BSD open source) # wpi: Intel 3945ABG Wireless LAN controller # Requires the wpi firmware module # Order for ISA/EISA devices is important here device bxe # Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE device ce device cp device cs # Crystal Semiconductor CS89x0 NIC hint.cs.0.at="isa" hint.cs.0.port="0x300" device ctau hint.ctau.0.at="isa" hint.ctau.0.port="0x240" hint.ctau.0.irq="15" hint.ctau.0.drq="7" #options NETGRAPH_CRONYX # Enable NETGRAPH support for Cronyx adapter(s) device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards options ED_3C503 options ED_HPP options ED_SIC hint.ed.0.at="isa" hint.ed.0.port="0x280" hint.ed.0.irq="5" hint.ed.0.maddr="0xd8000" device ipw # Intel 2100 wireless NICs. device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. device iwn # Intel 4965/1000/5000/6000 wireless NICs. # Hint for the i386-only ISA front-end of le(4). hint.le.0.at="isa" hint.le.0.port="0x280" hint.le.0.irq="10" hint.le.0.drq="0" device mthca # Mellanox HCA InfiniBand device mlx4 # Shared code module between IB and Ethernet device mlx4ib # Mellanox ConnectX HCA InfiniBand device mlx4en # Mellanox ConnectX HCA Ethernet device nfe # nVidia nForce MCP on-board Ethernet device sbni hint.sbni.0.at="isa" hint.sbni.0.port="0x210" hint.sbni.0.irq="0xefdead" hint.sbni.0.flags="0" device vmx # VMware VMXNET3 Ethernet device wpi # Intel 3945ABG wireless NICs. 
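#
# (Illustrative fragment, not a tested configuration: a wireless NIC
# generally needs its firmware module from the list below plus the
# machine-independent 802.11 stack from /sys/conf/NOTES, e.g. for iwn(4):
#
#	device iwn
#	device iwnfw
#	device wlan
#
# or the equivalent if_iwn/iwnfw modules loaded at boot time.)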
# IEEE 802.11 adapter firmware modules # Intel PRO/Wireless 2100 firmware: # ipwfw: BSS/IBSS/monitor mode firmware # ipwbssfw: BSS mode firmware # ipwibssfw: IBSS mode firmware # ipwmonitorfw: Monitor mode firmware # Intel PRO/Wireless 2200BG/2225BG/2915ABG firmware: # iwifw: BSS/IBSS/monitor mode firmware # iwibssfw: BSS mode firmware # iwiibssfw: IBSS mode firmware # iwimonitorfw: Monitor mode firmware # Intel Wireless WiFi Link 4965/1000/5000/6000 series firmware: # iwnfw: Single module to support all devices # iwn1000fw: Specific module for the 1000 only # iwn105fw: Specific module for the 105 only # iwn135fw: Specific module for the 135 only # iwn2000fw: Specific module for the 2000 only # iwn2030fw: Specific module for the 2030 only # iwn4965fw: Specific module for the 4965 only # iwn5000fw: Specific module for the 5000 only # iwn5150fw: Specific module for the 5150 only # iwn6000fw: Specific module for the 6000 only # iwn6000g2afw: Specific module for the 6000g2a only # iwn6000g2bfw: Specific module for the 6000g2b only # iwn6050fw: Specific module for the 6050 only # wpifw: Intel 3945ABG Wireless LAN Controller firmware device iwifw device iwibssfw device iwiibssfw device iwimonitorfw device ipwfw device ipwbssfw device ipwibssfw device ipwmonitorfw device iwnfw device iwn1000fw device iwn105fw device iwn135fw device iwn2000fw device iwn2030fw device iwn4965fw device iwn5000fw device iwn5150fw device iwn6000fw device iwn6000g2afw device iwn6000g2bfw device iwn6050fw device wpifw # # Non-Transparent Bridge (NTB) drivers # device if_ntb # Virtual NTB network interface device ntb_transport # NTB packet transport driver device ntb # NTB hardware interface device ntb_hw_intel # Intel NTB hardware driver device ntb_hw_plx # PLX NTB hardware driver # # ATA raid adapters # device pst # # Areca 11xx and 12xx series of SATA II RAID controllers. # CAM is required. # device arcmsr # Areca SATA II RAID # # 3ware 9000 series PATA/SATA RAID controller driver and options. # The driver is implemented as a SIM, and so, needs the CAM infrastructure. # options TWA_DEBUG # 0-10; 10 prints the most messages. device twa # 3ware 9000 series PATA/SATA RAID # # SCSI host adapters: # # ncv: NCR 53C500 based SCSI host adapters. # nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters. # stg: TMC 18C30, 18C50 based SCSI host adapters. device ncv device nsp device stg hint.stg.0.at="isa" hint.stg.0.port="0x140" hint.stg.0.irq="11" # # Adaptec FSA RAID controllers, including integrated DELL controllers, # the Dell PERC 2/QC and the HP NetRAID-4M device aac device aacp # SCSI Passthrough interface (optional, CAM required) # # Adaptec by PMC RAID controllers, Series 6/7/8 and upcoming families device aacraid # Container interface, CAM required # # Highpoint RocketRAID 27xx. device hpt27xx # # Highpoint RocketRAID 182x. device hptmv # # Highpoint DC7280 and R750. device hptnr # # Highpoint RocketRAID. Supports RR172x, RR222x, RR2240, RR232x, RR2340, # RR2210, RR174x, RR2522, RR231x, RR230x. 
device hptrr # # Highpoint RocketRaid 3xxx series SATA RAID device hptiop # # Intel integrated Memory Controller (iMC) SMBus controller # Sandybridge-Xeon, Ivybridge-Xeon, Haswell-Xeon, Broadwell-Xeon device imcsmb # # IBM (now Adaptec) ServeRAID controllers device ips # # Intel C600 (Patsburg) integrated SAS controller device isci options ISCI_LOGGING # enable debugging in isci HAL # # NVM Express (NVMe) support device nvme # base NVMe driver device nvd # expose NVMe namespaces as disks, depends on nvme # # PMC-Sierra SAS/SATA controller device pmspcv # # SafeNet crypto driver: can be moved to the MI NOTES as soon as # it's tested on a big-endian machine # device safe # SafeNet 1141 options SAFE_DEBUG # enable debugging support: hw.safe.debug options SAFE_RNDTEST # enable rndtest support # # glxiic is an I2C driver for the AMD Geode LX CS5536 System Management Bus # controller. Requires 'device iicbus'. # device glxiic # AMD Geode LX CS5536 System Management Bus # # glxsb is a driver for the Security Block in AMD Geode LX processors. # Requires 'device crypto'. # device glxsb # AMD Geode LX Security Block # # VirtIO support # # The virtio entry provides a generic bus for use by the device drivers. # It must be combined with an interface that communicates with the host. # Multiple such interfaces are defined by the VirtIO specification. FreeBSD # only has support for PCI. Therefore, virtio_pci must be statically # compiled in or loaded as a module for the device drivers to function. # device virtio # Generic VirtIO bus (required) device virtio_pci # VirtIO PCI Interface device vtnet # VirtIO Ethernet device device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device device hyperv # HyperV drivers ##################################################################### # # Miscellaneous hardware: # # apm: Laptop Advanced Power Management (experimental) # ipmi: Intelligent Platform Management Interface # smapi: System Management Application Program Interface driver # smbios: DMI/SMBIOS entry point # vpd: Vital Product Data kernel interface # pbio: Parallel (8255 PPI) basic I/O (mode 0) port (e.g. Advantech PCL-724) # asmc: Apple System Management Controller # si: Specialix International SI/XIO or SX intelligent serial card driver # tpm: Trusted Platform Module # Notes on APM # The flags take the following meaning for apm0: # 0x0020 Statclock is broken. # Notes on the Specialix SI/XIO driver: # The host card is memory, not IO mapped. # The Rev 1 host cards use a 64K chunk, on a 32K boundary. # The Rev 2 host cards use a 32K chunk, on a 32K boundary. # The cards can use an IRQ of 11, 12 or 15. # Notes on the Sony Programmable I/O controller # This is a temporary driver that should someday be replaced by something # that hooks into the ACPI layer. The device is hooked to the PIIX4's # General Device 10 decoder, which means you have to fiddle with PCI # registers to map it in, even though it is otherwise treated here as # an ISA device. At the moment, the driver polls, although the device # is capable of generating interrupts. It is largely undocumented. # The port location in the hint is where you WANT the device to be # mapped. 0x10a0 seems to be traditional. At the moment the jogdial # is the only thing truly supported, but apparently a fair percentage # of the Vaio extra features are controlled by this device. 
device apm hint.apm.0.flags="0x20" device ipmi device smapi device smbios device vpd device pbio hint.pbio.0.at="isa" hint.pbio.0.port="0x360" device asmc device tpm device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device aesni # AES-NI OpenCrypto module # # Laptop/Notebook options: # # See also: # apm under `Miscellaneous hardware' # above. # For older notebooks that signal a powerfail condition (external # power supply dropped, or battery state low) by issuing an NMI: options POWERFAIL_NMI # make it beep instead of panicing # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported interfaces: # pcf Philips PCF8584 ISA-bus controller # device pcf hint.pcf.0.at="isa" hint.pcf.0.port="0x320" hint.pcf.0.irq="5" # # Hardware watchdog timers: # # ichwd: Intel ICH watchdog timer # amdsbwd: AMD SB7xx watchdog timer # viawd: VIA south bridge watchdog timer # wbwd: Winbond watchdog timer # device ichwd device amdsbwd device viawd device wbwd # # Temperature sensors: # # coretemp: on-die sensor on Intel Core and newer CPUs # amdtemp: on-die sensor on AMD K8/K10/K11 CPUs # device coretemp device amdtemp # # CPU control pseudo-device. Provides access to MSRs, CPUID info and # microcode update feature. # device cpuctl # # System Management Bus (SMB) # options ENABLE_ALART # Control alarm on Intel intpm driver # # Set the number of PV entries per process. Increasing this can # stop panics related to heavy use of shared memory. However, that can # (combined with large amounts of physical memory) cause panics at # boot time due the kernel running out of VM space. # # If you're tweaking this, you might also want to increase the sysctls # "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target". # # The value below is the one more than the default. # options PMAP_SHPGPERPROC=201 # -# Change the size of the kernel virtual address space. Due to -# constraints in loader(8) on i386, this must be a multiple of 4. -# 256 = 1 GB of kernel address space. Increasing this also causes -# a reduction of the address space in user processes. 512 splits -# the 4GB cpu address space in half (2GB user, 2GB kernel). For PAE -# kernels, the value will need to be double non-PAE. A value of 1024 -# for PAE kernels is necessary to split the address space in half. -# This will likely need to be increased to handle memory sizes >4GB. -# PAE kernels default to a value of 512. -# -options KVA_PAGES=260 - -# # Number of initial kernel page table pages used for early bootstrap. # This number should include enough pages to map the kernel, any # modules or other data loaded with the kernel by the loader, and data # structures allocated before the VM system is initialized such as the # vm_page_t array. Each page table page maps 4MB (2MB with PAE). # options NKPT=31 ##################################################################### # ABI Emulation # Enable iBCS2 runtime support for SCO and ISC binaries #options IBCS2 # Emulate spx device for client side of SVR3 local X interface options SPX_HACK # Enable 32-bit runtime support for CloudABI binaries. 
options COMPAT_CLOUDABI32 # Enable Linux ABI emulation options COMPAT_LINUX # Enable i386 a.out binary support options COMPAT_AOUT # Enable the linux-like proc filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINPROCFS #Enable the linux-like sys filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINSYSFS # Enable NDIS binary driver support options NDISAPI device ndis ##################################################################### # VM OPTIONS - -# Disable the 4 MByte page PSE CPU feature. The PSE feature allows the -# kernel to use 4 MByte pages to map the kernel instead of 4k pages. -# This saves on the amount of memory needed for page tables needed to -# map the kernel. You should only disable this feature as a temporary -# workaround if you are having problems with it enabled. -# -#options DISABLE_PSE - -# Disable the global pages PGE CPU feature. The PGE feature allows pages -# to be marked with the PG_G bit. TLB entries for these pages are not -# flushed from the cache when %cr3 is reloaded. This can make context -# switches less expensive. You should only disable this feature as a -# temporary workaround if you are having problems with it enabled. -# -#options DISABLE_PG_G # KSTACK_PAGES is the number of memory pages to assign to the kernel # stack of each thread. options KSTACK_PAGES=5 # Enable detailed accounting by the PV entry allocator. options PV_STATS ##################################################################### # More undocumented options for linting. # Note that documenting these are not considered an affront. options FB_INSTALL_CDEV # install a CDEV entry in /dev options I586_PMC_GUPROF=0x70000 options KBDIO_DEBUG=2 options KBD_MAXRETRY=4 options KBD_MAXWAIT=6 options KBD_RESETDELAY=201 options PSM_DEBUG=1 options TIMER_FREQ=((14318182+6)/12) options VM_KMEM_SIZE options VM_KMEM_SIZE_MAX options VM_KMEM_SIZE_SCALE Index: head/sys/i386/i386/apic_vector.s =================================================================== --- head/sys/i386/i386/apic_vector.s (revision 332488) +++ head/sys/i386/i386/apic_vector.s (revision 332489) @@ -1,318 +1,333 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by I/O APICs * as well as IPI handlers. */ #include "opt_smp.h" #include +#include #include #include #include "assym.inc" .text SUPERALIGN_TEXT /* End Of Interrupt to APIC */ as_lapic_eoi: cmpl $0,x2apic_mode jne 1f movl lapic_map,%eax movl $0,LA_EOI(%eax) ret 1: movl $MSR_APIC_EOI,%ecx xorl %eax,%eax xorl %edx,%edx wrmsr ret /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word * in the ISR. The handler determines the highest bit set in the ISR, * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ -#define ISR_VEC(index, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name ## _pti) ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - SET_KERNEL_SREGS ; \ - cld ; \ - FAKE_MCOUNT(TF_EIP(%esp)) ; \ - cmpl $0,x2apic_mode ; \ - je 1f ; \ - movl $(MSR_APIC_ISR0 + index),%ecx ; \ - rdmsr ; \ - jmp 2f ; \ -1: ; \ - movl lapic_map, %edx ;/* pointer to local APIC */ \ - movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ -2: ; \ - bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ - jz 3f ; \ - addl $(32 * index),%eax ; \ - pushl %esp ; \ - pushl %eax ; /* pass the IRQ */ \ - call lapic_handle_intr ; \ - addl $8, %esp ; /* discard parameter */ \ -3: ; \ - MEXITCOUNT ; \ + .macro ISR_VEC index, vec_name + .text + SUPERALIGN_TEXT + .globl X\()\vec_name\()_pti, X\()\vec_name + +X\()\vec_name\()_pti: +X\()\vec_name: + PUSH_FRAME + SET_KERNEL_SREGS + cld + KENTER + FAKE_MCOUNT(TF_EIP(%esp)) + cmpl $0,x2apic_mode + je 2f + movl $(MSR_APIC_ISR0 + \index),%ecx + rdmsr + jmp 3f +2: + movl lapic_map, %edx /* pointer to local APIC */ + movl LA_ISR + 16 * \index(%edx), %eax /* load ISR */ +3: + bsrl %eax, %eax /* index of highest set bit in ISR */ + jz 4f + addl $(32 * \index),%eax + pushl %esp + pushl %eax /* pass the IRQ */ + movl $lapic_handle_intr, %eax + call *%eax + addl $8, %esp /* discard parameter */ +4: + MEXITCOUNT jmp doreti + .endm /* * Handle "spurious INTerrupts". * Notes: * This is different than the "spurious INTerrupt" generated by an * 8259 PIC for missing INTs. See the APIC documentation for details. * This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ iret - ISR_VEC(1, apic_isr1) - ISR_VEC(2, apic_isr2) - ISR_VEC(3, apic_isr3) - ISR_VEC(4, apic_isr4) - ISR_VEC(5, apic_isr5) - ISR_VEC(6, apic_isr6) - ISR_VEC(7, apic_isr7) + ISR_VEC 1, apic_isr1 + ISR_VEC 2, apic_isr2 + ISR_VEC 3, apic_isr3 + ISR_VEC 4, apic_isr4 + ISR_VEC 5, apic_isr5 + ISR_VEC 6, apic_isr6 + ISR_VEC 7, apic_isr7 /* * Local APIC periodic timer handler. 
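The ISR_VEC macro above decodes one 32-bit word of the local APIC in-service register: bsrl finds the highest set bit and the word index rebases it into the vector number handed to lapic_handle_intr(). A minimal C sketch of the same decode (isr_word_to_vector is a hypothetical helper; fls() stands in for bsrl):

#include <strings.h>	/* fls() */

static int
isr_word_to_vector(unsigned int isr_word, int index)
{
	if (isr_word == 0)
		return (-1);	/* no interrupt pending in this ISR word */
	/* fls() is the 1-based position of the highest set bit (bsrl + 1). */
	return (32 * index + fls(isr_word) - 1);
}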
*/ .text SUPERALIGN_TEXT IDTVEC(timerint_pti) IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call lapic_handle_timer + movl $lapic_handle_timer, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint_pti) IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) - call lapic_handle_cmc + movl $lapic_handle_cmc, %eax + call *%eax MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ .text SUPERALIGN_TEXT IDTVEC(errorint_pti) IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) - call lapic_handle_error + movl $lapic_handle_error, %eax + call *%eax MEXITCOUNT jmp doreti #ifdef XENHVM /* * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ .text SUPERALIGN_TEXT IDTVEC(xen_intr_upcall) PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call xen_intr_handle_upcall + movl $xen_intr_handle_upcall, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti #endif #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT invltlb_ret: call as_lapic_eoi jmp doreti SUPERALIGN_TEXT IDTVEC(invltlb) PUSH_FRAME SET_KERNEL_SREGS cld - - call invltlb_handler - + KENTER + movl $invltlb_handler, %eax + call *%eax jmp invltlb_ret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) PUSH_FRAME SET_KERNEL_SREGS cld - - call invlpg_handler - + KENTER + movl $invlpg_handler, %eax + call *%eax jmp invltlb_ret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) PUSH_FRAME SET_KERNEL_SREGS cld - - call invlrng_handler - + KENTER + movl $invlrng_handler, %eax + call *%eax jmp invltlb_ret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) PUSH_FRAME SET_KERNEL_SREGS cld - - call invlcache_handler - + KENTER + movl $invlcache_handler, %eax + call *%eax jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - FAKE_MCOUNT(TF_EIP(%esp)) - - call ipi_bitmap_handler + movl $ipi_bitmap_handler, %eax + call *%eax MEXITCOUNT jmp doreti /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - call cpustop_handler + movl $cpustop_handler, %eax + call *%eax jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpususpend) PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - call cpususpend_handler + movl $cpususpend_handler, %eax + call *%eax jmp doreti /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ .text SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movl ipi_rendezvous_counts(,%eax,4), %eax incl (%eax) #endif - call smp_rendezvous_action - + movl $smp_rendezvous_action, %eax + call *%eax call as_lapic_eoi jmp doreti #endif /* SMP */ Index: head/sys/i386/i386/atpic_vector.s =================================================================== --- head/sys/i386/i386/atpic_vector.s (revision 332488) +++ head/sys/i386/i386/atpic_vector.s (revision 332489) @@ -1,79 +1,84 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. 
* Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by the 8259A * master and slave interrupt controllers. */ +#include #include #include "assym.inc" /* * Macros for interrupt entry, call to handler, and exit. 
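A pattern worth noting in ISR_VEC above and the INTR macro below: the C handlers are now reached via movl $handler, %eax; call *%eax rather than a direct call. A plausible reading, consistent with the rest of this commit, is that these entry points also execute from the PTI trampoline copy, displaced by setidt_disp bytes, where a rel32 direct call assembled for the original address would land in the wrong place; an absolute indirect call works from either mapping. The same displacement shows up in C in copyout_init_tramp() later in this commit; a minimal sketch (pti_alias is a hypothetical helper):

extern vm_offset_t setidt_disp;	/* trampoline displacement; declaration assumed */

static void *
pti_alias(void *func)
{
	/* The trampoline copy of 'func' lives setidt_disp bytes away. */
	return ((char *)func + setidt_disp);
}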
*/ -#define INTR(irq_num, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name ##_pti) ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - SET_KERNEL_SREGS ; \ - cld ; \ -; \ - FAKE_MCOUNT(TF_EIP(%esp)) ; \ - pushl %esp ; \ - pushl $irq_num; /* pass the IRQ */ \ - call atpic_handle_intr ; \ - addl $8, %esp ; /* discard the parameters */ \ -; \ - MEXITCOUNT ; \ + .macro INTR irq_num, vec_name + .text + SUPERALIGN_TEXT + .globl X\()\vec_name\()_pti, X\()\vec_name + +X\()\vec_name\()_pti: +X\()\vec_name: + PUSH_FRAME + SET_KERNEL_SREGS + cld + KENTER + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + pushl $\irq_num /* pass the IRQ */ + movl $atpic_handle_intr, %eax + call *%eax + addl $8, %esp /* discard the parameters */ + + MEXITCOUNT jmp doreti + .endm - INTR(0, atpic_intr0) - INTR(1, atpic_intr1) - INTR(2, atpic_intr2) - INTR(3, atpic_intr3) - INTR(4, atpic_intr4) - INTR(5, atpic_intr5) - INTR(6, atpic_intr6) - INTR(7, atpic_intr7) - INTR(8, atpic_intr8) - INTR(9, atpic_intr9) - INTR(10, atpic_intr10) - INTR(11, atpic_intr11) - INTR(12, atpic_intr12) - INTR(13, atpic_intr13) - INTR(14, atpic_intr14) - INTR(15, atpic_intr15) + INTR 0, atpic_intr0 + INTR 1, atpic_intr1 + INTR 2, atpic_intr2 + INTR 3, atpic_intr3 + INTR 4, atpic_intr4 + INTR 5, atpic_intr5 + INTR 6, atpic_intr6 + INTR 7, atpic_intr7 + INTR 8, atpic_intr8 + INTR 9, atpic_intr9 + INTR 10, atpic_intr10 + INTR 11, atpic_intr11 + INTR 12, atpic_intr12 + INTR 13, atpic_intr13 + INTR 14, atpic_intr14 + INTR 15, atpic_intr15 Index: head/sys/i386/i386/bios.c =================================================================== --- head/sys/i386/i386/bios.c (revision 332488) +++ head/sys/i386/i386/bios.c (revision 332489) @@ -1,770 +1,750 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Michael Smith * Copyright (c) 1998 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Code for dealing with the BIOS in x86 PC systems. 
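The core primitive in this file is bios_sigsearch() (defined below): it scans the 0xe0000-0xfffff BIOS window for a paragraph-aligned signature and returns the signature's physical address, or 0 if not found. A short usage sketch modeled on the calls bios32_init() itself makes (error handling elided):

u_int32_t sigaddr;
struct bios32_SDheader *sdh;

/* Find the BIOS32 Service Directory: "_32_" on a 16-byte boundary. */
sigaddr = bios_sigsearch(0, "_32_", 4, 16, 0);
if (sigaddr != 0)
	sdh = (struct bios32_SDheader *)(uintptr_t)BIOS_PADDRTOVADDR(sigaddr);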
*/ #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ISA #include #include #include #endif #define BIOS_START 0xe0000 #define BIOS_SIZE 0x20000 /* exported lookup results */ struct bios32_SDentry PCIbios; static struct PnPBIOS_table *PnPBIOStable; static u_int bios32_SDCI; /* start fairly early */ static void bios32_init(void *junk); SYSINIT(bios32, SI_SUB_CPU, SI_ORDER_ANY, bios32_init, NULL); /* * bios32_init * * Locate various bios32 entities. */ static void bios32_init(void *junk) { u_long sigaddr; struct bios32_SDheader *sdh; struct PnPBIOS_table *pt; u_int8_t ck, *cv; int i; char *p; /* * BIOS32 Service Directory, PCI BIOS */ /* look for the signature */ if ((sigaddr = bios_sigsearch(0, "_32_", 4, 16, 0)) != 0) { /* get a virtual pointer to the structure */ sdh = (struct bios32_SDheader *)(uintptr_t)BIOS_PADDRTOVADDR(sigaddr); for (cv = (u_int8_t *)sdh, ck = 0, i = 0; i < (sdh->len * 16); i++) { ck += cv[i]; } /* If checksum is OK, enable use of the entrypoint */ if ((ck == 0) && (BIOS_START <= sdh->entry ) && (sdh->entry < (BIOS_START + BIOS_SIZE))) { bios32_SDCI = BIOS_PADDRTOVADDR(sdh->entry); if (bootverbose) { printf("bios32: Found BIOS32 Service Directory header at %p\n", sdh); printf("bios32: Entry = 0x%x (%x) Rev = %d Len = %d\n", sdh->entry, bios32_SDCI, sdh->revision, sdh->len); } /* Allow user override of PCI BIOS search */ if (((p = kern_getenv("machdep.bios.pci")) == NULL) || strcmp(p, "disable")) { /* See if there's a PCI BIOS entrypoint here */ PCIbios.ident.id = 0x49435024; /* PCI systems should have this */ if (!bios32_SDlookup(&PCIbios) && bootverbose) printf("pcibios: PCI BIOS entry at 0x%x+0x%x\n", PCIbios.base, PCIbios.entry); } if (p != NULL) freeenv(p); } else { printf("bios32: Bad BIOS32 Service Directory\n"); } } /* * PnP BIOS * * Allow user override of PnP BIOS search */ if ((((p = kern_getenv("machdep.bios.pnp")) == NULL) || strcmp(p, "disable")) && ((sigaddr = bios_sigsearch(0, "$PnP", 4, 16, 0)) != 0)) { /* get a virtual pointer to the structure */ pt = (struct PnPBIOS_table *)(uintptr_t)BIOS_PADDRTOVADDR(sigaddr); for (cv = (u_int8_t *)pt, ck = 0, i = 0; i < pt->len; i++) { ck += cv[i]; } /* If checksum is OK, enable use of the entrypoint */ if (ck == 0) { PnPBIOStable = pt; if (bootverbose) { printf("pnpbios: Found PnP BIOS data at %p\n", pt); printf("pnpbios: Entry = %x:%x Rev = %d.%d\n", pt->pmentrybase, pt->pmentryoffset, pt->version >> 4, pt->version & 0xf); if ((pt->control & 0x3) == 0x01) printf("pnpbios: Event flag at %x\n", pt->evflagaddr); if (pt->oemdevid != 0) printf("pnpbios: OEM ID %x\n", pt->oemdevid); } } else { printf("pnpbios: Bad PnP BIOS data checksum\n"); } } if (p != NULL) freeenv(p); if (bootverbose) { /* look for other know signatures */ printf("Other BIOS signatures found:\n"); } } /* * bios32_SDlookup * * Query the BIOS32 Service Directory for the service named in (ent), * returns nonzero if the lookup fails. The caller must fill in * (ent->ident), the remainder are populated on a successful lookup. */ int bios32_SDlookup(struct bios32_SDentry *ent) { struct bios_regs args; if (bios32_SDCI == 0) return (1); args.eax = ent->ident.id; /* set up arguments */ args.ebx = args.ecx = args.edx = 0; bios32(&args, bios32_SDCI, GSEL(GCODE_SEL, SEL_KPL)); if ((args.eax & 0xff) == 0) { /* success? 
*/ ent->base = args.ebx; ent->len = args.ecx; ent->entry = args.edx; ent->ventry = BIOS_PADDRTOVADDR(ent->base + ent->entry); return (0); /* all OK */ } return (1); /* failed */ } /* * bios_sigsearch * * Search some or all of the BIOS region for a signature string. * * (start) Optional offset returned from this function * (for searching for multiple matches), or NULL * to start the search from the base of the BIOS. * Note that this will be a _physical_ address in * the range 0xe0000 - 0xfffff. * (sig) is a pointer to the byte(s) of the signature. * (siglen) number of bytes in the signature. * (paralen) signature paragraph (alignment) size. * (sigofs) offset of the signature within the paragraph. * * Returns the _physical_ address of the found signature, 0 if the * signature was not found. */ u_int32_t bios_sigsearch(u_int32_t start, u_char *sig, int siglen, int paralen, int sigofs) { u_char *sp, *end; /* compute the starting address */ if ((start >= BIOS_START) && (start <= (BIOS_START + BIOS_SIZE))) { sp = (char *)BIOS_PADDRTOVADDR(start); } else if (start == 0) { sp = (char *)BIOS_PADDRTOVADDR(BIOS_START); } else { return 0; /* bogus start address */ } /* compute the end address */ end = (u_char *)BIOS_PADDRTOVADDR(BIOS_START + BIOS_SIZE); /* loop searching */ while ((sp + sigofs + siglen) < end) { /* compare here */ if (!bcmp(sp + sigofs, sig, siglen)) { /* convert back to physical address */ return((u_int32_t)BIOS_VADDRTOPADDR(sp)); } sp += paralen; } return(0); } /* * do not staticize, used by bioscall.s */ union { struct { u_short offset; u_short segment; } vec16; struct { u_int offset; u_short segment; } vec32; } bioscall_vector; /* bios jump vector */ void set_bios_selectors(struct bios_segments *seg, int flags) { struct soft_segment_descriptor ssd = { 0, /* segment base address (overwritten) */ 0, /* length (overwritten) */ SDT_MEMERA, /* segment type (overwritten) */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 1, /* descriptor size (overwritten) */ 0 /* granularity == byte units */ }; union descriptor *p_gdt; #ifdef SMP p_gdt = &gdt[PCPU_GET(cpuid) * NGDT]; #else p_gdt = gdt; #endif ssd.ssd_base = seg->code32.base; ssd.ssd_limit = seg->code32.limit; ssdtosd(&ssd, &p_gdt[GBIOSCODE32_SEL].sd); ssd.ssd_def32 = 0; if (flags & BIOSCODE_FLAG) { ssd.ssd_base = seg->code16.base; ssd.ssd_limit = seg->code16.limit; ssdtosd(&ssd, &p_gdt[GBIOSCODE16_SEL].sd); } ssd.ssd_type = SDT_MEMRWA; if (flags & BIOSDATA_FLAG) { ssd.ssd_base = seg->data.base; ssd.ssd_limit = seg->data.limit; ssdtosd(&ssd, &p_gdt[GBIOSDATA_SEL].sd); } if (flags & BIOSUTIL_FLAG) { ssd.ssd_base = seg->util.base; ssd.ssd_limit = seg->util.limit; ssdtosd(&ssd, &p_gdt[GBIOSUTIL_SEL].sd); } if (flags & BIOSARGS_FLAG) { ssd.ssd_base = seg->args.base; ssd.ssd_limit = seg->args.limit; ssdtosd(&ssd, &p_gdt[GBIOSARGS_SEL].sd); } } extern int vm86pa; +extern u_long vm86phystk; extern void bios16_jmp(void); /* * this routine is really greedy with selectors, and uses 5: * * 32-bit code selector: to return to kernel * 16-bit code selector: for running code * data selector: for 16-bit data * util selector: extra utility selector * args selector: to handle pointers * * the util selector is set from the util16 entry in bios16_args, if a * "U" specifier is seen. * * See for description of format specifiers */ int bios16(struct bios_args *args, char *fmt, ...) 
{ char *p, *stack, *stack_top; va_list ap; int flags = BIOSCODE_FLAG | BIOSDATA_FLAG; u_int i, arg_start, arg_end; pt_entry_t *pte; - pd_entry_t *ptd; + pd_entry_t *ptd, orig_ptd; arg_start = 0xffffffff; arg_end = 0; /* * Some BIOS entrypoints attempt to copy the largest-case * argument frame (in order to generalise handling for * different entry types). If our argument frame is * smaller than this, the BIOS will reach off the top of * our constructed stack segment. Pad the top of the stack * with some garbage to avoid this. */ stack = (caddr_t)PAGE_SIZE - 32; va_start(ap, fmt); for (p = fmt; p && *p; p++) { switch (*p) { case 'p': /* 32-bit pointer */ i = va_arg(ap, u_int); arg_start = min(arg_start, i); arg_end = max(arg_end, i); flags |= BIOSARGS_FLAG; stack -= 4; break; case 'i': /* 32-bit integer */ i = va_arg(ap, u_int); stack -= 4; break; case 'U': /* 16-bit selector */ flags |= BIOSUTIL_FLAG; /* FALLTHROUGH */ case 'D': /* 16-bit selector */ case 'C': /* 16-bit selector */ stack -= 2; break; case 's': /* 16-bit integer passed as an int */ i = va_arg(ap, int); stack -= 2; break; default: va_end(ap); return (EINVAL); } } va_end(ap); if (flags & BIOSARGS_FLAG) { if (arg_end - arg_start > ctob(16)) return (EACCES); args->seg.args.base = arg_start; args->seg.args.limit = 0xffff; } args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME; args->seg.code32.limit = 0xffff; - ptd = (pd_entry_t *)rcr3(); -#if defined(PAE) || defined(PAE_TABLES) - if (ptd == IdlePDPT) -#else - if (ptd == IdlePTD) -#endif - { - /* - * no page table, so create one and install it. - */ - pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); - ptd = (pd_entry_t *)((u_int)IdlePTD + KERNBASE); - *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V; - *ptd = vtophys(pte) | PG_RW | PG_V; - } else { - /* - * this is a user-level page table - */ - pte = PTmap; - *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V; - } + /* + * no page table, so create one and install it. + */ + pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + ptd = IdlePTD; + *pte = vm86phystk | PG_RW | PG_V; + orig_ptd = *ptd; + *ptd = vtophys(pte) | PG_RW | PG_V; pmap_invalidate_all(kernel_pmap); /* XXX insurance for now */ stack_top = stack; va_start(ap, fmt); for (p = fmt; p && *p; p++) { switch (*p) { case 'p': /* 32-bit pointer */ i = va_arg(ap, u_int); *(u_int *)stack = (i - arg_start) | (GSEL(GBIOSARGS_SEL, SEL_KPL) << 16); stack += 4; break; case 'i': /* 32-bit integer */ i = va_arg(ap, u_int); *(u_int *)stack = i; stack += 4; break; case 'U': /* 16-bit selector */ *(u_short *)stack = GSEL(GBIOSUTIL_SEL, SEL_KPL); stack += 2; break; case 'D': /* 16-bit selector */ *(u_short *)stack = GSEL(GBIOSDATA_SEL, SEL_KPL); stack += 2; break; case 'C': /* 16-bit selector */ *(u_short *)stack = GSEL(GBIOSCODE16_SEL, SEL_KPL); stack += 2; break; case 's': /* 16-bit integer passed as an int */ i = va_arg(ap, int); *(u_short *)stack = i; stack += 2; break; default: va_end(ap); return (EINVAL); } } va_end(ap); set_bios_selectors(&args->seg, flags); bioscall_vector.vec16.offset = (u_short)args->entry; bioscall_vector.vec16.segment = GSEL(GBIOSCODE16_SEL, SEL_KPL); i = bios16_call(&args->r, stack_top); - if (pte == PTmap) { - *pte = 0; /* remove entry */ - /* - * XXX only needs to be invlpg(0) but that doesn't work on the 386 - */ - pmap_invalidate_all(kernel_pmap); - } else { - *ptd = 0; /* remove page table */ - /* - * XXX only needs to be invlpg(0) but that doesn't work on the 386 - */ - pmap_invalidate_all(kernel_pmap); - free(pte, M_TEMP); /* ... 
and free it */ - } + *ptd = orig_ptd; /* remove page table */ + /* + * XXX only needs to be invlpg(0) but that doesn't work on the 386 + */ + pmap_invalidate_all(kernel_pmap); + free(pte, M_TEMP); /* ... and free it */ return (i); } int bios_oem_strings(struct bios_oem *oem, u_char *buffer, size_t maxlen) { size_t idx = 0; struct bios_oem_signature *sig; u_int from, to; u_char c, *s, *se, *str, *bios_str; size_t i, off, len, tot; if ( !oem || !buffer || maxlen<2 ) return(-1); sig = oem->signature; if (!sig) return(-2); from = oem->range.from; to = oem->range.to; if ( (to<=from) || (from(BIOS_START+BIOS_SIZE)) ) return(-3); while (sig->anchor != NULL) { str = sig->anchor; len = strlen(str); off = sig->offset; tot = sig->totlen; /* make sure offset doesn't go beyond bios area */ if ( (to+off)>(BIOS_START+BIOS_SIZE) || ((from+off) maxlen - 1) { printf("sys/i386/i386/bios.c: sig '%s' " "idx %d + tot %d = %d > maxlen-1 %d\n", str, idx, tot, idx+tot, maxlen-1); return(-5); } bios_str = NULL; s = (u_char *)BIOS_PADDRTOVADDR(from); se = (u_char *)BIOS_PADDRTOVADDR(to-len); for (; s 0x7E) ) c = ' '; if (idx == 0) { if (c != ' ') buffer[idx++] = c; } else if ( (c != ' ') || ((c == ' ') && (buffer[idx-1] != ' ')) ) buffer[idx++] = c; } } sig++; } /* remove a final trailing space */ if ( (idx > 1) && (buffer[idx-1] == ' ') ) idx--; buffer[idx] = '\0'; return (idx); } #ifdef DEV_ISA /* * PnP BIOS interface; enumerate devices only known to the system * BIOS and save information about them for later use. */ struct pnp_sysdev { u_int16_t size; u_int8_t handle; u_int32_t devid; u_int8_t type[3]; u_int16_t attrib; #define PNPATTR_NODISABLE (1<<0) /* can't be disabled */ #define PNPATTR_NOCONFIG (1<<1) /* can't be configured */ #define PNPATTR_OUTPUT (1<<2) /* can be primary output */ #define PNPATTR_INPUT (1<<3) /* can be primary input */ #define PNPATTR_BOOTABLE (1<<4) /* can be booted from */ #define PNPATTR_DOCK (1<<5) /* is a docking station */ #define PNPATTR_REMOVEABLE (1<<6) /* device is removeable */ #define PNPATTR_CONFIG_STATIC (0) #define PNPATTR_CONFIG_DYNAMIC (1) #define PNPATTR_CONFIG_DYNONLY (3) #define PNPATTR_CONFIG(a) (((a) >> 7) & 0x3) /* device-specific data comes here */ u_int8_t devdata[0]; } __packed; /* We have to cluster arguments within a 64k range for the bios16 call */ struct pnp_sysdevargs { u_int16_t next; struct pnp_sysdev node; }; /* * This function is called after the bus has assigned resource * locations for a logical device. */ static void pnpbios_set_config(void *arg, struct isa_config *config, int enable) { } /* * Quiz the PnP BIOS, build a list of PNP IDs and resource data. */ static void pnpbios_identify(driver_t *driver, device_t parent) { struct PnPBIOS_table *pt = PnPBIOStable; struct bios_args args; struct pnp_sysdev *pd; struct pnp_sysdevargs *pda; u_int16_t ndevs, bigdev; int error, currdev; u_int8_t *devnodebuf, tag; u_int32_t *devid, *compid; int idx, left; device_t dev; /* no PnP BIOS information */ if (pt == NULL) return; /* Check to see if ACPI is already active. */ dev = devclass_get_device(devclass_find("acpi"), 0); if (dev != NULL && device_is_attached(dev)) return; /* get count of PnP devices */ bzero(&args, sizeof(args)); args.seg.code16.base = BIOS_PADDRTOVADDR(pt->pmentrybase); args.seg.code16.limit = 0xffff; /* XXX ? 
*/ args.seg.data.base = BIOS_PADDRTOVADDR(pt->pmdataseg); args.seg.data.limit = 0xffff; args.entry = pt->pmentryoffset; if ((error = bios16(&args, PNP_COUNT_DEVNODES, &ndevs, &bigdev)) || (args.r.eax & 0xff)) { printf("pnpbios: error %d/%x getting device count/size limit\n", error, args.r.eax); return; } ndevs &= 0xff; /* clear high byte garbage */ if (bootverbose) printf("pnpbios: %d devices, largest %d bytes\n", ndevs, bigdev); devnodebuf = malloc(bigdev + (sizeof(struct pnp_sysdevargs) - sizeof(struct pnp_sysdev)), M_DEVBUF, M_NOWAIT); if (devnodebuf == NULL) { printf("pnpbios: cannot allocate memory, bailing\n"); return; } pda = (struct pnp_sysdevargs *)devnodebuf; pd = &pda->node; for (currdev = 0, left = ndevs; (currdev != 0xff) && (left > 0); left--) { bzero(pd, bigdev); pda->next = currdev; /* get current configuration */ if ((error = bios16(&args, PNP_GET_DEVNODE, &pda->next, &pda->node, 1))) { printf("pnpbios: error %d making BIOS16 call\n", error); break; } if ((error = (args.r.eax & 0xff))) { if (bootverbose) printf("pnpbios: %s 0x%x fetching node %d\n", error & 0x80 ? "error" : "warning", error, currdev); if (error & 0x80) break; } currdev = pda->next; if (pd->size < sizeof(struct pnp_sysdev)) { printf("pnpbios: bogus system node data, aborting scan\n"); break; } /* * Ignore PICs so that we don't have to worry about the PICs * claiming IRQs to prevent their use. The PIC drivers * already ensure that invalid IRQs are not used. */ if (!strcmp(pnp_eisaformat(pd->devid), "PNP0000")) /* ISA PIC */ continue; if (!strcmp(pnp_eisaformat(pd->devid), "PNP0003")) /* APIC */ continue; /* Add the device and parse its resources */ dev = BUS_ADD_CHILD(parent, ISA_ORDER_PNPBIOS, NULL, -1); isa_set_vendorid(dev, pd->devid); isa_set_logicalid(dev, pd->devid); /* * It appears that some PnP BIOS doesn't allow us to re-enable * the embedded system device once it is disabled. We shall * mark all system device nodes as "cannot be disabled", regardless * of actual settings in the device attribute byte. * XXX isa_set_configattr(dev, ((pd->attrib & PNPATTR_NODISABLE) ? 0 : ISACFGATTR_CANDISABLE) | ((!(pd->attrib & PNPATTR_NOCONFIG) && PNPATTR_CONFIG(pd->attrib) != PNPATTR_CONFIG_STATIC) ? ISACFGATTR_DYNAMIC : 0)); */ isa_set_configattr(dev, (!(pd->attrib & PNPATTR_NOCONFIG) && PNPATTR_CONFIG(pd->attrib) != PNPATTR_CONFIG_STATIC) ? 
ISACFGATTR_DYNAMIC : 0); isa_set_pnpbios_handle(dev, pd->handle); ISA_SET_CONFIG_CALLBACK(parent, dev, pnpbios_set_config, 0); pnp_parse_resources(dev, &pd->devdata[0], pd->size - sizeof(struct pnp_sysdev), 0); if (!device_get_desc(dev)) device_set_desc_copy(dev, pnp_eisaformat(pd->devid)); /* Find device IDs */ devid = &pd->devid; compid = NULL; /* look for a compatible device ID too */ left = pd->size - sizeof(struct pnp_sysdev); idx = 0; while (idx < left) { tag = pd->devdata[idx++]; if (PNP_RES_TYPE(tag) == 0) { /* Small resource */ switch (PNP_SRES_NUM(tag)) { case PNP_TAG_COMPAT_DEVICE: compid = (u_int32_t *)(pd->devdata + idx); if (bootverbose) printf("pnpbios: node %d compat ID 0x%08x\n", pd->handle, *compid); /* FALLTHROUGH */ case PNP_TAG_END: idx = left; break; default: idx += PNP_SRES_LEN(tag); break; } } else /* Large resource, skip it */ idx += *(u_int16_t *)(pd->devdata + idx) + 2; } if (bootverbose) { printf("pnpbios: handle %d device ID %s (%08x)", pd->handle, pnp_eisaformat(*devid), *devid); if (compid != NULL) printf(" compat ID %s (%08x)", pnp_eisaformat(*compid), *compid); printf("\n"); } } } static device_method_t pnpbios_methods[] = { /* Device interface */ DEVMETHOD(device_identify, pnpbios_identify), { 0, 0 } }; static driver_t pnpbios_driver = { "pnpbios", pnpbios_methods, 1, /* no softc */ }; static devclass_t pnpbios_devclass; DRIVER_MODULE(pnpbios, isa, pnpbios_driver, pnpbios_devclass, 0, 0); #endif /* DEV_ISA */ Index: head/sys/i386/i386/copyout.c =================================================================== --- head/sys/i386/i386/copyout.c (nonexistent) +++ head/sys/i386/i386/copyout.c (revision 332489) @@ -0,0 +1,489 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
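One guard repeats before every user access in the new copyout.c below: the user range must not wrap around and must not extend past VM_MAXUSER_ADDRESS. The file open-codes the test; factored out as a sketch (useraddr_ok is a hypothetical helper):

static bool
useraddr_ok(const void *base, size_t len)
{
	uintptr_t p = (uintptr_t)base;

	/* Reject wraparound and anything beyond the user VA range. */
	return (p + len >= p && p + len <= VM_MAXUSER_ADDRESS);
}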
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(PAE) || defined(PAE_TABLES) +#define KCR3 ((u_int)IdlePDPT) +#else +#define KCR3 ((u_int)IdlePTD) +#endif + +int copyin_fast(const void *udaddr, void *kaddr, size_t len, u_int); +static int (*copyin_fast_tramp)(const void *, void *, size_t, u_int); +int copyout_fast(const void *kaddr, void *udaddr, size_t len, u_int); +static int (*copyout_fast_tramp)(const void *, void *, size_t, u_int); +int fubyte_fast(volatile const void *base, u_int kcr3); +static int (*fubyte_fast_tramp)(volatile const void *, u_int); +int fuword16_fast(volatile const void *base, u_int kcr3); +static int (*fuword16_fast_tramp)(volatile const void *, u_int); +int fueword_fast(volatile const void *base, long *val, u_int kcr3); +static int (*fueword_fast_tramp)(volatile const void *, long *, u_int); +int subyte_fast(volatile void *base, int val, u_int kcr3); +static int (*subyte_fast_tramp)(volatile void *, int, u_int); +int suword16_fast(volatile void *base, int val, u_int kcr3); +static int (*suword16_fast_tramp)(volatile void *, int, u_int); +int suword_fast(volatile void *base, long val, u_int kcr3); +static int (*suword_fast_tramp)(volatile void *, long, u_int); + +static int fast_copyout = 1; +SYSCTL_INT(_machdep, OID_AUTO, fast_copyout, CTLFLAG_RWTUN, + &fast_copyout, 0, + ""); + +void +copyout_init_tramp(void) +{ + + copyin_fast_tramp = (int (*)(const void *, void *, size_t, u_int))( + (uintptr_t)copyin_fast + setidt_disp); + copyout_fast_tramp = (int (*)(const void *, void *, size_t, u_int))( + (uintptr_t)copyout_fast + setidt_disp); + fubyte_fast_tramp = (int (*)(volatile const void *, u_int))( + (uintptr_t)fubyte_fast + setidt_disp); + fuword16_fast_tramp = (int (*)(volatile const void *, u_int))( + (uintptr_t)fuword16_fast + setidt_disp); + fueword_fast_tramp = (int (*)(volatile const void *, long *, u_int))( + (uintptr_t)fueword_fast + setidt_disp); + subyte_fast_tramp = (int (*)(volatile void *, int, u_int))( + (uintptr_t)subyte_fast + setidt_disp); + suword16_fast_tramp = (int (*)(volatile void *, int, u_int))( + (uintptr_t)suword16_fast + setidt_disp); + suword_fast_tramp = (int (*)(volatile void *, long, u_int))( + (uintptr_t)suword_fast + setidt_disp); +} + +static int +cp_slow0(vm_offset_t uva, size_t len, bool write, + void (*f)(vm_offset_t, void *), void *arg) +{ + struct pcpu *pc; + vm_page_t m[2]; + pt_entry_t *pte; + vm_offset_t kaddr; + int error, i, plen; + bool sleepable; + + plen = howmany(uva - trunc_page(uva) + len, PAGE_SIZE); + MPASS(plen <= nitems(m)); + error = 0; + i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, uva, len, + (write ? 
VM_PROT_WRITE : VM_PROT_READ) | VM_PROT_QUICK_NOFAULT, + m, nitems(m)); + if (i != plen) + return (EFAULT); + sched_pin(); + pc = get_pcpu(); + if (!THREAD_CAN_SLEEP() || curthread->td_vslock_sz > 0 || + (curthread->td_pflags & TDP_NOFAULTING) != 0) { + sleepable = false; + mtx_lock(&pc->pc_copyout_mlock); + kaddr = pc->pc_copyout_maddr; + } else { + sleepable = true; + sx_xlock(&pc->pc_copyout_slock); + kaddr = pc->pc_copyout_saddr; + } + for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { + *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(m[i]) | + pmap_cache_bits(pmap_page_get_memattr(m[i]), FALSE); + invlpg(kaddr + ptoa(i)); + } + kaddr += uva - trunc_page(uva); + f(kaddr, arg); + sched_unpin(); + if (sleepable) + sx_xunlock(&pc->pc_copyout_slock); + else + mtx_unlock(&pc->pc_copyout_mlock); + for (i = 0; i < plen; i++) { + vm_page_lock(m[i]); + vm_page_unhold(m[i]); + vm_page_unlock(m[i]); + } + return (error); +} + +struct copyinstr_arg0 { + vm_offset_t kc; + size_t len; + size_t alen; + bool end; +}; + +static void +copyinstr_slow0(vm_offset_t kva, void *arg) +{ + struct copyinstr_arg0 *ca; + char c; + + ca = arg; + MPASS(ca->alen == 0 && ca->len > 0 && !ca->end); + while (ca->alen < ca->len && !ca->end) { + c = *(char *)(kva + ca->alen); + *(char *)ca->kc = c; + ca->alen++; + ca->kc++; + if (c == '\0') + ca->end = true; + } +} + +int +copyinstr(const void *udaddr, void *kaddr, size_t maxlen, size_t *lencopied) +{ + struct copyinstr_arg0 ca; + vm_offset_t uc; + size_t plen; + int error; + + error = 0; + ca.end = false; + for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < maxlen && !ca.end; uc += ca.alen, plen += ca.alen) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > maxlen) + ca.len = maxlen - plen; + ca.alen = 0; + if (cp_slow0(uc, ca.len, false, copyinstr_slow0, &ca) != 0) { + error = EFAULT; + break; + } + } + if (!ca.end && plen == maxlen && error == 0) + error = ENAMETOOLONG; + if (lencopied != NULL) + *lencopied = plen; + return (error); +} + +struct copyin_arg0 { + vm_offset_t kc; + size_t len; +}; + +static void +copyin_slow0(vm_offset_t kva, void *arg) +{ + struct copyin_arg0 *ca; + + ca = arg; + bcopy((void *)kva, (void *)ca->kc, ca->len); +} + +int +copyin(const void *udaddr, void *kaddr, size_t len) +{ + struct copyin_arg0 ca; + vm_offset_t uc; + size_t plen; + + if ((uintptr_t)udaddr + len < (uintptr_t)udaddr || + (uintptr_t)udaddr + len > VM_MAXUSER_ADDRESS) + return (-1); + if (len == 0 || (fast_copyout && len <= TRAMP_COPYOUT_SZ && + copyin_fast_tramp(udaddr, kaddr, len, KCR3) == 0)) + return (0); + for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > len) + ca.len = len - plen; + if (cp_slow0(uc, ca.len, false, copyin_slow0, &ca) != 0) + return (EFAULT); + } + return (0); +} + +static void +copyout_slow0(vm_offset_t kva, void *arg) +{ + struct copyin_arg0 *ca; + + ca = arg; + bcopy((void *)ca->kc, (void *)kva, ca->len); +} + +int +copyout(const void *kaddr, void *udaddr, size_t len) +{ + struct copyin_arg0 ca; + vm_offset_t uc; + size_t plen; + + if ((uintptr_t)udaddr + len < (uintptr_t)udaddr || + (uintptr_t)udaddr + len > VM_MAXUSER_ADDRESS) + return (-1); + if (len == 0 || (fast_copyout && len <= TRAMP_COPYOUT_SZ && + copyout_fast_tramp(kaddr, udaddr, len, KCR3) == 0)) + return (0); + for (plen = 0, uc = 
(vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > len) + ca.len = len - plen; + if (cp_slow0(uc, ca.len, true, copyout_slow0, &ca) != 0) + return (EFAULT); + } + return (0); +} + +/* + * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user + * memory. + */ + +static void +fubyte_slow0(vm_offset_t kva, void *arg) +{ + + *(int *)arg = *(u_char *)kva; +} + +int +fubyte(volatile const void *base) +{ + int res; + + if ((uintptr_t)base + sizeof(uint8_t) < (uintptr_t)base || + (uintptr_t)base + sizeof(uint8_t) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout) { + res = fubyte_fast_tramp(base, KCR3); + if (res != -1) + return (res); + } + if (cp_slow0((vm_offset_t)base, sizeof(char), false, fubyte_slow0, + &res) != 0) + return (-1); + return (res); +} + +static void +fuword16_slow0(vm_offset_t kva, void *arg) +{ + + *(int *)arg = *(uint16_t *)kva; +} + +int +fuword16(volatile const void *base) +{ + int res; + + if ((uintptr_t)base + sizeof(uint16_t) < (uintptr_t)base || + (uintptr_t)base + sizeof(uint16_t) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout) { + res = fuword16_fast_tramp(base, KCR3); + if (res != -1) + return (res); + } + if (cp_slow0((vm_offset_t)base, sizeof(uint16_t), false, + fuword16_slow0, &res) != 0) + return (-1); + return (res); +} + +static void +fueword_slow0(vm_offset_t kva, void *arg) +{ + + *(uint32_t *)arg = *(uint32_t *)kva; +} + +int +fueword(volatile const void *base, long *val) +{ + uint32_t res; + + if ((uintptr_t)base + sizeof(*val) < (uintptr_t)base || + (uintptr_t)base + sizeof(*val) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout) { + if (fueword_fast_tramp(base, val, KCR3) == 0) + return (0); + } + if (cp_slow0((vm_offset_t)base, sizeof(long), false, fueword_slow0, + &res) != 0) + return (-1); + *val = res; + return (0); +} + +int +fueword32(volatile const void *base, int32_t *val) +{ + + return (fueword(base, (long *)val)); +} + +/* + * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory. + */ + +static void +subyte_slow0(vm_offset_t kva, void *arg) +{ + + *(u_char *)kva = *(int *)arg; +} + +int +subyte(volatile void *base, int byte) +{ + + if ((uintptr_t)base + sizeof(uint8_t) < (uintptr_t)base || + (uintptr_t)base + sizeof(uint8_t) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout && subyte_fast_tramp(base, byte, KCR3) == 0) + return (0); + return (cp_slow0((vm_offset_t)base, sizeof(u_char), true, subyte_slow0, + &byte) != 0 ? -1 : 0); +} + +static void +suword16_slow0(vm_offset_t kva, void *arg) +{ + + *(int *)kva = *(uint16_t *)arg; +} + +int +suword16(volatile void *base, int word) +{ + + if ((uintptr_t)base + sizeof(uint16_t) < (uintptr_t)base || + (uintptr_t)base + sizeof(uint16_t) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout && suword16_fast_tramp(base, word, KCR3) == 0) + return (0); + return (cp_slow0((vm_offset_t)base, sizeof(int16_t), true, + suword16_slow0, &word) != 0 ? -1 : 0); +} + +static void +suword_slow0(vm_offset_t kva, void *arg) +{ + + *(int *)kva = *(uint32_t *)arg; +} + +int +suword(volatile void *base, long word) +{ + + if ((uintptr_t)base + sizeof(word) < (uintptr_t)base || + (uintptr_t)base + sizeof(word) > VM_MAXUSER_ADDRESS) + return (-1); + if (fast_copyout && suword_fast_tramp(base, word, KCR3) == 0) + return (0); + return (cp_slow0((vm_offset_t)base, sizeof(long), true, + suword_slow0, &word) != 0 ? 
-1 : 0); +} + +int +suword32(volatile void *base, int32_t word) +{ + + return (suword(base, word)); +} + +struct casueword_arg0 { + uint32_t oldval; + uint32_t newval; +}; + +static void +casueword_slow0(vm_offset_t kva, void *arg) +{ + struct casueword_arg0 *ca; + + ca = arg; + atomic_fcmpset_int((u_int *)kva, &ca->oldval, ca->newval); +} + +int +casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, + uint32_t newval) +{ + struct casueword_arg0 ca; + int res; + + ca.oldval = oldval; + ca.newval = newval; + res = cp_slow0((vm_offset_t)base, sizeof(int32_t), true, + casueword_slow0, &ca); + if (res == 0) { + *oldvalp = ca.oldval; + return (0); + } + return (-1); +} + +int +casueword(volatile u_long *base, u_long oldval, u_long *oldvalp, u_long newval) +{ + struct casueword_arg0 ca; + int res; + + ca.oldval = oldval; + ca.newval = newval; + res = cp_slow0((vm_offset_t)base, sizeof(int32_t), true, + casueword_slow0, &ca); + if (res == 0) { + *oldvalp = ca.oldval; + return (0); + } + return (-1); +} Property changes on: head/sys/i386/i386/copyout.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/i386/i386/copyout_fast.s =================================================================== --- head/sys/i386/i386/copyout_fast.s (nonexistent) +++ head/sys/i386/i386/copyout_fast.s (revision 332489) @@ -0,0 +1,362 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
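The assembly below implements the fast copy path: with interrupts disabled, stage the bytes in a per-CPU bounce buffer, flip %cr3 to the user page tables for the user-side move, then flip back; the real code also switches to the per-CPU trampoline stack, since the normal kernel stack is not mapped under the user page tables. A hedged C sketch of the control flow only (names such as the copyout_buf PCPU field and pcb_cr3 are taken from the assembly and assumed here; fault recovery via copyout_fault is omitted):

static int
copyout_fast_sketch(const void *kaddr, void *udaddr, size_t len, u_int kcr3)
{
	char *buf;

	disable_intr();
	buf = (char *)PCPU_GET(copyout_buf);	/* per-CPU bounce buffer */
	bcopy(kaddr, buf, len);			/* stage under kernel %cr3 */
	load_cr3(PCPU_GET(curpcb)->pcb_cr3);	/* switch to user page tables */
	bcopy(buf, udaddr, len);		/* user copy; may fault */
	load_cr3(kcr3);				/* back to kernel page tables */
	enable_intr();
	return (0);
}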
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include "assym.inc" + + .text + +ENTRY(copyout_fast) + pushl %ebp + movl %esp, %ebp + pushl %esi + pushl %edi + pushl %ebx + + movl $copyout_fault,%edx + movl 20(%ebp),%ebx /* KCR3 */ + + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%edi + + cli + movl PCPU(TRAMPSTK),%esi + movl PCPU(COPYOUT_BUF),%eax + subl $4,%esi + movl %eax,(%esi) + movl 12(%ebp),%eax /* udaddr */ + subl $4,%esi + movl %eax,(%esi) + movl 16(%ebp),%eax /* len */ + subl $4,%esi + movl %eax,(%esi) + + subl $4, %esi + movl %edi, (%esi) + + movl 8(%ebp),%eax /* kaddr */ + subl $4,%esi + movl %eax,(%esi) + movl PCPU(COPYOUT_BUF),%eax + subl $4,%esi + movl %eax,(%esi) + movl 16(%ebp),%eax /* len */ + subl $4,%esi + movl %eax,(%esi) + + movl %esp,%eax + movl %esi,%esp + + /* bcopy(%esi = kaddr, %edi = PCPU(copyout_buf), %ecx = len) */ + popl %ecx + popl %edi + popl %esi + rep; movsb + + popl %edi + movl %edi,%cr3 + + /* bcopy(%esi = PCPU(copyout_buf), %edi = udaddr, %ecx = len) */ + popl %ecx + popl %edi + popl %esi + rep; movsb + + movl %ebx,%cr3 + movl %eax,%esp + sti + + xorl %eax,%eax + popl %ebx + popl %edi + popl %esi + leave + ret +END(copyout_fast) + +ENTRY(copyin_fast) + pushl %ebp + movl %esp, %ebp + pushl %esi + pushl %edi + pushl %ebx + + movl $copyout_fault,%edx + movl 20(%ebp),%ebx /* KCR3 */ + + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%edi + + cli + movl PCPU(TRAMPSTK),%esi + movl PCPU(COPYOUT_BUF),%eax + subl $4,%esi + movl %eax,(%esi) + movl 12(%ebp),%eax /* kaddr */ + subl $4,%esi + movl %eax,(%esi) + movl 16(%ebp),%eax /* len */ + subl $4,%esi + movl %eax,(%esi) + + movl 8(%ebp),%eax /* udaddr */ + subl $4,%esi + movl %eax,(%esi) + movl PCPU(COPYOUT_BUF),%eax + subl $4,%esi + movl %eax,(%esi) + movl 16(%ebp),%eax /* len */ + subl $4,%esi + movl %eax,(%esi) + + movl %esp,%eax + movl %esi,%esp + movl %edi,%cr3 + + /* bcopy(%esi = udaddr, %edi = PCPU(copyout_buf), %ecx = len) */ + popl %ecx + popl %edi + popl %esi + rep; movsb + + movl %ebx,%cr3 + + /* bcopy(%esi = PCPU(copyout_buf), %edi = kaddr, %ecx = len) */ + popl %ecx + popl %edi + popl %esi + rep; movsb + + movl %eax,%esp + sti + + xorl %eax,%eax + popl %ebx + popl %edi + popl %esi + leave + ret +END(copyin_fast) + + ALIGN_TEXT +copyout_fault: + movl %eax,%esp + sti + movl $EFAULT,%eax + popl %ebx + popl %edi + popl %esi + leave + ret + +ENTRY(fueword_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl 8(%ebp),%ecx /* from */ + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 16(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 + movl (%ecx),%eax + movl %ebx,%cr3 + movl %esi,%esp + sti + movl 12(%ebp),%edx + movl %eax,(%edx) + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + leave + ret +END(fueword_fast) + +ENTRY(fuword16_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl 8(%ebp),%ecx /* from */ + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 12(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 + movzwl (%ecx),%eax + movl %ebx,%cr3 + movl %esi,%esp + sti + popl %edi + popl %esi + popl %ebx + leave + ret +END(fuword16_fast) + +ENTRY(fubyte_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl 8(%ebp),%ecx /* from */ + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 12(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 
+ movzbl (%ecx),%eax + movl %ebx,%cr3 + movl %esi,%esp + sti + popl %edi + popl %esi + popl %ebx + leave + ret +END(fubyte_fast) + + ALIGN_TEXT +fusufault: + movl %esi,%esp + sti + xorl %eax,%eax + decl %eax + popl %edi + popl %esi + popl %ebx + leave + ret + +ENTRY(suword_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 8(%ebp),%ecx /* to */ + movl 12(%ebp),%edi /* val */ + movl 16(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 + movl %edi,(%ecx) + movl %ebx,%cr3 + movl %esi,%esp + sti + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + leave + ret +END(suword_fast) + +ENTRY(suword16_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 8(%ebp),%ecx /* to */ + movl 12(%ebp),%edi /* val */ + movl 16(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 + movw %di,(%ecx) + movl %ebx,%cr3 + movl %esi,%esp + sti + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + leave + ret +END(suword16_fast) + +ENTRY(subyte_fast) + pushl %ebp + movl %esp,%ebp + pushl %ebx + pushl %esi + pushl %edi + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax),%eax + movl $fusufault,%edx + movl 8(%ebp),%ecx /* to */ + movl 12(%ebp),%edi /* val */ + movl 16(%ebp),%ebx + movl %esp,%esi + cli + movl PCPU(TRAMPSTK),%esp + movl %eax,%cr3 + movl %edi,%eax + movb %al,(%ecx) + movl %ebx,%cr3 + movl %esi,%esp + sti + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + leave + ret +END(subyte_fast) Property changes on: head/sys/i386/i386/copyout_fast.s ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/i386/i386/db_interface.c =================================================================== --- head/sys/i386/i386/db_interface.c (revision 332488) +++ head/sys/i386/i386/db_interface.c (revision 332489) @@ -1,118 +1,121 @@ /*- * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); /* * Interface to new debugger. */ #include #include #include #include #include #include #include /* * Read bytes from kernel address space for debugger. 
*/ int db_read_bytes(vm_offset_t addr, size_t size, char *data) { jmp_buf jb; void *prev_jb; char *src; int ret; prev_jb = kdb_jmpbuf(jb); ret = setjmp(jb); if (ret == 0) { src = (char *)addr; while (size-- > 0) *data++ = *src++; } (void)kdb_jmpbuf(prev_jb); return (ret); } /* * Write bytes to kernel address space for debugger. */ int db_write_bytes(vm_offset_t addr, size_t size, char *data) { jmp_buf jb; void *prev_jb; char *dst; int ret; prev_jb = kdb_jmpbuf(jb); ret = setjmp(jb); if (ret == 0) { dst = (char *)addr; while (size-- > 0) *dst++ = *data++; } (void)kdb_jmpbuf(prev_jb); return (ret); } int db_segsize(struct trapframe *tfp) { struct proc_ldt *plp; struct segment_descriptor *sdp; int sel; if (tfp == NULL) return (32); if (tfp->tf_eflags & PSL_VM) return (16); sel = tfp->tf_cs & 0xffff; if (sel == GSEL(GCODE_SEL, SEL_KPL)) return (32); /* Rare cases follow. User mode cases are currently unreachable. */ if (ISLDT(sel)) { plp = curthread->td_proc->p_md.md_ldt; sdp = (plp != NULL) ? &plp->ldt_sd : &ldt[0].sd; } else { sdp = &gdt[PCPU_GET(cpuid) * NGDT].sd; } return (sdp[IDXSEL(sel)].sd_def32 == 0 ? 16 : 32); } void db_show_mdpcpu(struct pcpu *pc) { db_printf("APIC ID = %d\n", pc->pc_apic_id); db_printf("currentldt = 0x%x\n", pc->pc_currentldt); + db_printf("trampstk = 0x%x\n", pc->pc_trampstk); + db_printf("kesp0 = 0x%x\n", pc->pc_kesp0); + db_printf("common_tssp = 0x%x\n", (u_int)pc->pc_common_tssp); } Index: head/sys/i386/i386/db_trace.c =================================================================== --- head/sys/i386/i386/db_trace.c (revision 332488) +++ head/sys/i386/i386/db_trace.c (revision 332489) @@ -1,766 +1,791 @@ /*- * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static db_varfcn_t db_esp; static db_varfcn_t db_frame; static db_varfcn_t db_frame_seg; static db_varfcn_t db_gs; static db_varfcn_t db_ss; /* * Machine register set. 
*/ #define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x) struct db_variable db_regs[] = { { "cs", DB_OFFSET(tf_cs), db_frame_seg }, { "ds", DB_OFFSET(tf_ds), db_frame_seg }, { "es", DB_OFFSET(tf_es), db_frame_seg }, { "fs", DB_OFFSET(tf_fs), db_frame_seg }, { "gs", NULL, db_gs }, { "ss", NULL, db_ss }, { "eax", DB_OFFSET(tf_eax), db_frame }, { "ecx", DB_OFFSET(tf_ecx), db_frame }, { "edx", DB_OFFSET(tf_edx), db_frame }, { "ebx", DB_OFFSET(tf_ebx), db_frame }, { "esp", NULL, db_esp }, { "ebp", DB_OFFSET(tf_ebp), db_frame }, { "esi", DB_OFFSET(tf_esi), db_frame }, { "edi", DB_OFFSET(tf_edi), db_frame }, { "eip", DB_OFFSET(tf_eip), db_frame }, { "efl", DB_OFFSET(tf_eflags), db_frame }, }; struct db_variable *db_eregs = db_regs + nitems(db_regs); static __inline int get_esp(struct trapframe *tf) { return (TF_HAS_STACKREGS(tf) ? tf->tf_esp : (intptr_t)&tf->tf_esp); } static int db_frame(struct db_variable *vp, db_expr_t *valuep, int op) { int *reg; if (kdb_frame == NULL) return (0); reg = (int *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep); if (op == DB_VAR_GET) *valuep = *reg; else *reg = *valuep; return (1); } static int db_frame_seg(struct db_variable *vp, db_expr_t *valuep, int op) { struct trapframe_vm86 *tfp; int off; uint16_t *reg; if (kdb_frame == NULL) return (0); off = (intptr_t)vp->valuep; if (kdb_frame->tf_eflags & PSL_VM) { tfp = (void *)kdb_frame; switch ((intptr_t)vp->valuep) { case (intptr_t)DB_OFFSET(tf_cs): reg = (uint16_t *)&tfp->tf_cs; break; case (intptr_t)DB_OFFSET(tf_ds): reg = (uint16_t *)&tfp->tf_vm86_ds; break; case (intptr_t)DB_OFFSET(tf_es): reg = (uint16_t *)&tfp->tf_vm86_es; break; case (intptr_t)DB_OFFSET(tf_fs): reg = (uint16_t *)&tfp->tf_vm86_fs; break; } } else reg = (uint16_t *)((uintptr_t)kdb_frame + off); if (op == DB_VAR_GET) *valuep = *reg; else *reg = *valuep; return (1); } static int db_esp(struct db_variable *vp, db_expr_t *valuep, int op) { if (kdb_frame == NULL) return (0); if (op == DB_VAR_GET) *valuep = get_esp(kdb_frame); else if (TF_HAS_STACKREGS(kdb_frame)) kdb_frame->tf_esp = *valuep; return (1); } static int db_gs(struct db_variable *vp, db_expr_t *valuep, int op) { struct trapframe_vm86 *tfp; if (kdb_frame != NULL && kdb_frame->tf_eflags & PSL_VM) { tfp = (void *)kdb_frame; if (op == DB_VAR_GET) *valuep = tfp->tf_vm86_gs; else tfp->tf_vm86_gs = *valuep; return (1); } if (op == DB_VAR_GET) *valuep = rgs(); else load_gs(*valuep); return (1); } static int db_ss(struct db_variable *vp, db_expr_t *valuep, int op) { if (kdb_frame == NULL) return (0); if (op == DB_VAR_GET) *valuep = TF_HAS_STACKREGS(kdb_frame) ? kdb_frame->tf_ss : rss(); else if (TF_HAS_STACKREGS(kdb_frame)) kdb_frame->tf_ss = *valuep; return (1); } #define NORMAL 0 #define TRAP 1 #define INTERRUPT 2 #define SYSCALL 3 #define DOUBLE_FAULT 4 #define TRAP_INTERRUPT 5 #define TRAP_TIMERINT 6 static void db_nextframe(struct i386_frame **, db_addr_t *, struct thread *); static int db_numargs(struct i386_frame *); static void db_print_stack_entry(const char *, int, char **, int *, db_addr_t, void *); static void decode_syscall(int, struct thread *); static const char * watchtype_str(int type); int i386_set_watch(int watchnum, unsigned int watchaddr, int size, int access, struct dbreg *d); int i386_clr_watch(int watchnum, struct dbreg *d); /* * Figure out how many arguments were passed into the frame at "fp". 
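 *
 * This is a heuristic: it decodes the instruction at the caller's
 * return address and infers the count from the caller's stack
 * cleanup.  For example, a caller that pops two 4-byte arguments
 * continues with
 *
 *	addl	$8,%esp		(bytes 0x83 0xc4 0x08, 8 / 4 = 2 args)
 *
 * which is what the 0xc483 pattern below matches.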
*/ static int db_numargs(fp) struct i386_frame *fp; { char *argp; int inst; int args; argp = (char *)db_get_value((int)&fp->f_retaddr, 4, FALSE); /* * XXX etext is wrong for LKMs. We should attempt to interpret * the instruction at the return address in all cases. This * may require better fault handling. */ if (argp < btext || argp >= etext) { args = -1; } else { retry: inst = db_get_value((int)argp, 4, FALSE); if ((inst & 0xff) == 0x59) /* popl %ecx */ args = 1; else if ((inst & 0xffff) == 0xc483) /* addl $Ibs, %esp */ args = ((inst >> 16) & 0xff) / 4; else if ((inst & 0xf8ff) == 0xc089) { /* movl %eax, %Reg */ argp += 2; goto retry; } else args = -1; } return (args); } static void db_print_stack_entry(name, narg, argnp, argp, callpc, frame) const char *name; int narg; char **argnp; int *argp; db_addr_t callpc; void *frame; { int n = narg >= 0 ? narg : 5; db_printf("%s(", name); while (n) { if (argnp) db_printf("%s=", *argnp++); db_printf("%r", db_get_value((int)argp, 4, FALSE)); argp++; if (--n != 0) db_printf(","); } if (narg < 0) db_printf(",..."); db_printf(") at "); db_printsym(callpc, DB_STGY_PROC); if (frame != NULL) db_printf("/frame 0x%r", (register_t)frame); db_printf("\n"); } static void decode_syscall(int number, struct thread *td) { struct proc *p; c_db_sym_t sym; db_expr_t diff; sy_call_t *f; const char *symname; db_printf(" (%d", number); p = (td != NULL) ? td->td_proc : NULL; if (p != NULL && 0 <= number && number < p->p_sysent->sv_size) { f = p->p_sysent->sv_table[number].sy_call; sym = db_search_symbol((db_addr_t)f, DB_STGY_ANY, &diff); if (sym != DB_SYM_NULL && diff == 0) { db_symbol_values(sym, &symname, NULL); db_printf(", %s, %s", p->p_sysent->sv_name, symname); } } db_printf(")"); } /* * Figure out the next frame up in the call stack. */ static void db_nextframe(struct i386_frame **fp, db_addr_t *ip, struct thread *td) { struct trapframe *tf; int frame_type; int eip, esp, ebp; db_expr_t offset; c_db_sym_t sym; const char *name; eip = db_get_value((int) &(*fp)->f_retaddr, 4, FALSE); ebp = db_get_value((int) &(*fp)->f_frame, 4, FALSE); /* * Figure out frame type. We look at the address just before * the saved instruction pointer, as the saved EIP is the address * just after the call; if the function being called is marked as * dead (such as panic() at the end of dblfault_handler()), then * the instruction at the saved EIP will be part of a different * function (syscall() in this example) rather than the one that * actually made the call. */ frame_type = NORMAL; - sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset); + if (eip >= PMAP_TRM_MIN_ADDRESS) { + sym = db_search_symbol(eip - 1 - setidt_disp, DB_STGY_ANY, + &offset); + } else { + sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset); + } db_symbol_values(sym, &name, NULL); if (name != NULL) { if (strcmp(name, "calltrap") == 0 || strcmp(name, "fork_trampoline") == 0) frame_type = TRAP; else if (strncmp(name, "Xatpic_intr", 11) == 0 || strncmp(name, "Xapic_isr", 9) == 0) frame_type = INTERRUPT; else if (strcmp(name, "Xlcall_syscall") == 0 || strcmp(name, "Xint0x80_syscall") == 0) frame_type = SYSCALL; else if (strcmp(name, "dblfault_handler") == 0) frame_type = DOUBLE_FAULT; /* XXX: These are interrupts with trap frames. */ else if (strcmp(name, "Xtimerint") == 0) frame_type = TRAP_TIMERINT; else if (strcmp(name, "Xcpustop") == 0 || strcmp(name, "Xrendezvous") == 0 || strcmp(name, "Xipi_intr_bitmap_handler") == 0) frame_type = TRAP_INTERRUPT; } /* * Normal frames need no special processing.
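 *
 * For these the unwind is the conventional %ebp chain, conceptually
 *
 *	*ip = fp->f_retaddr;
 *	*fp = fp->f_frame;
 *
 * with both words already fetched through db_get_value() above so
 * that an unmapped frame is caught by the debugger's fault recovery.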
*/ if (frame_type == NORMAL) { *ip = (db_addr_t) eip; *fp = (struct i386_frame *) ebp; return; } db_print_stack_entry(name, 0, 0, 0, eip, &(*fp)->f_frame); /* * For a double fault, we have to snag the values from the * previous TSS since a double fault uses a task gate to * switch to a known good state. */ if (frame_type == DOUBLE_FAULT) { - esp = PCPU_GET(common_tss.tss_esp); - eip = PCPU_GET(common_tss.tss_eip); - ebp = PCPU_GET(common_tss.tss_ebp); + esp = PCPU_GET(common_tssp)->tss_esp; + eip = PCPU_GET(common_tssp)->tss_eip; + ebp = PCPU_GET(common_tssp)->tss_ebp; db_printf( "--- trap 0x17, eip = %#r, esp = %#r, ebp = %#r ---\n", eip, esp, ebp); *ip = (db_addr_t) eip; *fp = (struct i386_frame *) ebp; return; } /* * Point to base of trapframe which is just above the * current frame. */ if (frame_type == INTERRUPT) tf = (struct trapframe *)((int)*fp + 16); else if (frame_type == TRAP_INTERRUPT) tf = (struct trapframe *)((int)*fp + 8); else tf = (struct trapframe *)((int)*fp + 12); - if (INKERNEL((int) tf)) { - esp = get_esp(tf); - eip = tf->tf_eip; - ebp = tf->tf_ebp; - switch (frame_type) { - case TRAP: - db_printf("--- trap %#r", tf->tf_trapno); - break; - case SYSCALL: - db_printf("--- syscall"); - decode_syscall(tf->tf_eax, td); - break; - case TRAP_TIMERINT: - case TRAP_INTERRUPT: - case INTERRUPT: - db_printf("--- interrupt"); - break; - default: - panic("The moon has moved again."); - } - db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip, - esp, ebp); + esp = get_esp(tf); + eip = tf->tf_eip; + ebp = tf->tf_ebp; + switch (frame_type) { + case TRAP: + db_printf("--- trap %#r", tf->tf_trapno); + break; + case SYSCALL: + db_printf("--- syscall"); + decode_syscall(tf->tf_eax, td); + break; + case TRAP_TIMERINT: + case TRAP_INTERRUPT: + case INTERRUPT: + db_printf("--- interrupt"); + break; + default: + panic("The moon has moved again."); } + db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip, esp, ebp); + switch (frame_type) { + case TRAP: + case TRAP_TIMERINT: + case TRAP_INTERRUPT: + case INTERRUPT: + if ((tf->tf_eflags & PSL_VM) != 0 || + (tf->tf_cs & SEL_RPL_MASK) != 0) + ebp = 0; + break; + case SYSCALL: + ebp = 0; + break; + } + *ip = (db_addr_t) eip; *fp = (struct i386_frame *) ebp; } static int db_backtrace(struct thread *td, struct trapframe *tf, struct i386_frame *frame, db_addr_t pc, register_t sp, int count) { struct i386_frame *actframe; #define MAXNARG 16 char *argnames[MAXNARG], **argnp = NULL; const char *name; int *argp; db_expr_t offset; c_db_sym_t sym; int instr, narg; boolean_t first; if (db_segsize(tf) == 16) { db_printf( "--- 16-bit%s, cs:eip = %#x:%#x, ss:esp = %#x:%#x, ebp = %#x, tf = %p ---\n", (tf->tf_eflags & PSL_VM) ? " (vm86)" : "", tf->tf_cs, tf->tf_eip, TF_HAS_STACKREGS(tf) ? tf->tf_ss : rss(), TF_HAS_STACKREGS(tf) ? tf->tf_esp : (intptr_t)&tf->tf_esp, tf->tf_ebp, tf); return (0); } + /* 'frame' can be null initially. Just print the pc then. */ + if (frame == NULL) + goto out; + /* * If an indirect call via an invalid pointer caused a trap, * %pc contains the invalid address while the return address * of the unlucky caller has been saved by CPU on the stack * just before the trap frame. In this case, try to recover * the caller's address so that the first frame is assigned * to the right spot in the right function, for that is where * the failure actually happened. * * This trick depends on the fault address stashed in tf_err * by trap_fatal() before entering KDB. 
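 *
 * In other words: the indirect call pushed its return address and
 * then faulted fetching the bogus target, so when %eip matches
 * tf_err the word at the true end of the trap frame is the real
 * caller's return address, recovered below with db_get_value().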
*/ if (kdb_frame && pc == kdb_frame->tf_err) { /* * Find where the trap frame actually ends. * It won't contain tf_esp or tf_ss unless crossing rings. */ if (TF_HAS_STACKREGS(kdb_frame)) instr = (int)(kdb_frame + 1); else instr = (int)&kdb_frame->tf_esp; pc = db_get_value(instr, 4, FALSE); } if (count == -1) count = 1024; first = TRUE; while (count-- && !db_pager_quit) { sym = db_search_symbol(pc, DB_STGY_ANY, &offset); db_symbol_values(sym, &name, NULL); /* * Attempt to determine a (possibly fake) frame that gives * the caller's pc. It may differ from `frame' if the * current function never sets up a standard frame or hasn't * set one up yet or has just discarded one. The last two * cases can be guessed fairly reliably for code generated * by gcc. The first case is too much trouble to handle in * general because the amount of junk on the stack depends * on the pc (the special handling of "calltrap", etc. in * db_nextframe() works because the `next' pc is special). */ actframe = frame; if (first) { first = FALSE; if (sym == C_DB_SYM_NULL && sp != 0) { /* * If a symbol couldn't be found, we've probably * jumped to a bogus location, so try and use * the return address to find our caller. */ db_print_stack_entry(name, 0, 0, 0, pc, NULL); pc = db_get_value(sp, 4, FALSE); if (db_search_symbol(pc, DB_STGY_PROC, &offset) == C_DB_SYM_NULL) break; continue; } else if (tf != NULL) { instr = db_get_value(pc, 4, FALSE); if ((instr & 0xffffff) == 0x00e58955) { /* pushl %ebp; movl %esp, %ebp */ actframe = (void *)(get_esp(tf) - 4); } else if ((instr & 0xffff) == 0x0000e589) { /* movl %esp, %ebp */ actframe = (void *)get_esp(tf); if (tf->tf_ebp == 0) { /* Fake frame better. */ frame = actframe; } } else if ((instr & 0xff) == 0x000000c3) { /* ret */ actframe = (void *)(get_esp(tf) - 4); } else if (offset == 0) { /* Probably an assembler symbol. */ actframe = (void *)(get_esp(tf) - 4); } } else if (strcmp(name, "fork_trampoline") == 0) { /* * Don't try to walk back on a stack for a * process that hasn't actually been run yet. */ db_print_stack_entry(name, 0, 0, 0, pc, actframe); break; } } argp = &actframe->f_arg0; narg = MAXNARG; if (sym != NULL && db_sym_numargs(sym, &narg, argnames)) { argnp = argnames; } else { narg = db_numargs(frame); } db_print_stack_entry(name, narg, argnp, argp, pc, actframe); if (actframe != frame) { /* `frame' belongs to caller. */ pc = (db_addr_t) db_get_value((int)&actframe->f_retaddr, 4, FALSE); continue; } db_nextframe(&frame, &pc, td); - if (INKERNEL((int)pc) && !INKERNEL((int) frame)) { +out: + /* + * 'frame' can be null here, either because it was initially + * null or because db_nextframe() found no frame. + * db_nextframe() may also have found a non-kernel frame. + * !INKERNEL() classifies both. Stop tracing if either, + * after printing the pc if it is the kernel. + */ + if (frame == NULL || frame <= actframe) { sym = db_search_symbol(pc, DB_STGY_ANY, &offset); db_symbol_values(sym, &name, NULL); db_print_stack_entry(name, 0, 0, 0, pc, frame); - break; - } - if (!INKERNEL((int) frame)) { break; } } return (0); } void db_trace_self(void) { struct i386_frame *frame; db_addr_t callpc; register_t ebp; __asm __volatile("movl %%ebp,%0" : "=r" (ebp)); frame = (struct i386_frame *)ebp; callpc = (db_addr_t)db_get_value((int)&frame->f_retaddr, 4, FALSE); frame = frame->f_frame; db_backtrace(curthread, NULL, frame, callpc, 0, -1); } int db_trace_thread(struct thread *thr, int count) { struct pcb *ctx; struct trapframe *tf; ctx = kdb_thr_ctx(thr); tf = thr == kdb_thread ? 
kdb_frame : NULL; return (db_backtrace(thr, tf, (struct i386_frame *)ctx->pcb_ebp, ctx->pcb_eip, ctx->pcb_esp, count)); } int i386_set_watch(watchnum, watchaddr, size, access, d) int watchnum; unsigned int watchaddr; int size; int access; struct dbreg *d; { int i, len; if (watchnum == -1) { for (i = 0; i < 4; i++) if (!DBREG_DR7_ENABLED(d->dr[7], i)) break; if (i < 4) watchnum = i; else return (-1); } switch (access) { case DBREG_DR7_EXEC: size = 1; /* size must be 1 for an execution breakpoint */ /* fall through */ case DBREG_DR7_WRONLY: case DBREG_DR7_RDWR: break; default: return (-1); } /* * we can watch a 1, 2, or 4 byte sized location */ switch (size) { case 1: len = DBREG_DR7_LEN_1; break; case 2: len = DBREG_DR7_LEN_2; break; case 4: len = DBREG_DR7_LEN_4; break; default: return (-1); } /* clear the bits we are about to affect */ d->dr[7] &= ~DBREG_DR7_MASK(watchnum); /* set drN register to the address, N=watchnum */ DBREG_DRX(d, watchnum) = watchaddr; /* enable the watchpoint */ d->dr[7] |= DBREG_DR7_SET(watchnum, len, access, DBREG_DR7_GLOBAL_ENABLE); return (watchnum); } int i386_clr_watch(watchnum, d) int watchnum; struct dbreg *d; { if (watchnum < 0 || watchnum >= 4) return (-1); d->dr[7] &= ~DBREG_DR7_MASK(watchnum); DBREG_DRX(d, watchnum) = 0; return (0); } int db_md_set_watchpoint(addr, size) db_expr_t addr; db_expr_t size; { struct dbreg d; int avail, i, wsize; fill_dbregs(NULL, &d); avail = 0; for(i = 0; i < 4; i++) { if (!DBREG_DR7_ENABLED(d.dr[7], i)) avail++; } if (avail * 4 < size) return (-1); for (i = 0; i < 4 && (size > 0); i++) { if (!DBREG_DR7_ENABLED(d.dr[7], i)) { if (size > 2) wsize = 4; else wsize = size; i386_set_watch(i, addr, wsize, DBREG_DR7_WRONLY, &d); addr += wsize; size -= wsize; } } set_dbregs(NULL, &d); return(0); } int db_md_clr_watchpoint(addr, size) db_expr_t addr; db_expr_t size; { struct dbreg d; int i; fill_dbregs(NULL, &d); for(i = 0; i < 4; i++) { if (DBREG_DR7_ENABLED(d.dr[7], i)) { if ((DBREG_DRX((&d), i) >= addr) && (DBREG_DRX((&d), i) < addr+size)) i386_clr_watch(i, &d); } } set_dbregs(NULL, &d); return(0); } static const char * watchtype_str(type) int type; { switch (type) { case DBREG_DR7_EXEC : return "execute"; break; case DBREG_DR7_RDWR : return "read/write"; break; case DBREG_DR7_WRONLY : return "write"; break; default : return "invalid"; break; } } void db_md_list_watchpoints(void) { struct dbreg d; int i, len, type; fill_dbregs(NULL, &d); db_printf("\nhardware watchpoints:\n"); db_printf(" watch status type len address\n"); db_printf(" ----- -------- ---------- --- ----------\n"); for (i = 0; i < 4; i++) { if (DBREG_DR7_ENABLED(d.dr[7], i)) { type = DBREG_DR7_ACCESS(d.dr[7], i); len = DBREG_DR7_LEN(d.dr[7], i); db_printf(" %-5d %-8s %10s %3d ", i, "enabled", watchtype_str(type), len + 1); db_printsym((db_addr_t)DBREG_DRX(&d, i), DB_STGY_ANY); db_printf("\n"); } else { db_printf(" %-5d disabled\n", i); } } db_printf("\ndebug register values:\n"); for (i = 0; i < 8; i++) if (i != 4 && i != 5) db_printf(" dr%d 0x%08x\n", i, DBREG_DRX(&d, i)); db_printf("\n"); } Index: head/sys/i386/i386/elf_machdep.c =================================================================== --- head/sys/i386/i386/elf_machdep.c (revision 332488) +++ head/sys/i386/i386/elf_machdep.c (revision 332489) @@ -1,283 +1,282 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 1996-1998 John D. Polstra. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct sysentvec elf32_freebsd_sysvec = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = __elfN(freebsd_fixup), .sv_sendsig = sendsig, .sv_sigcode = sigcode, .sv_szsigcode = &szsigcode, .sv_name = "FreeBSD ELF32", .sv_coredump = __elfN(coredump), .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_FREEBSD | SV_IA32 | SV_ILP32 | SV_SHP | SV_TIMEKEEP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, .sv_shared_page_base = SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); static Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); static Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); static Elf32_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld.so.1", .sysvec = 
&elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &kfreebsd_brand_info); - void elf32_dump_thread(struct thread *td, void *dst, size_t *off) { void *buf; size_t len; len = 0; if (use_xsave) { if (dst != NULL) { npxgetregs(td); len += elf32_populate_note(NT_X86_XSTATE, get_pcb_user_save_td(td), dst, cpu_max_ext_state_size, &buf); *(uint64_t *)((char *)buf + X86_XSTATE_XCR0_OFFSET) = xsave_mask; } else len += elf32_populate_note(NT_X86_XSTATE, NULL, NULL, cpu_max_ext_state_size, NULL); } *off = len; } /* Process one elf relocation with addend. */ static int elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, int local, elf_lookup_fn lookup) { Elf_Addr *where; Elf_Addr addr; Elf_Addr addend; Elf_Word rtype, symidx; const Elf_Rel *rel; const Elf_Rela *rela; int error; switch (type) { case ELF_RELOC_REL: rel = (const Elf_Rel *)data; where = (Elf_Addr *) (relocbase + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); break; case ELF_RELOC_RELA: rela = (const Elf_Rela *)data; where = (Elf_Addr *) (relocbase + rela->r_offset); addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); break; default: panic("unknown reloc type %d\n", type); } if (local) { if (rtype == R_386_RELATIVE) { /* A + B */ addr = elf_relocaddr(lf, relocbase + addend); if (*where != addr) *where = addr; } return (0); } switch (rtype) { case R_386_NONE: /* none */ break; case R_386_32: /* S + A */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; addr += addend; if (*where != addr) *where = addr; break; case R_386_PC32: /* S + A - P */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; addr += addend - (Elf_Addr)where; if (*where != addr) *where = addr; break; case R_386_COPY: /* none */ /* * There shouldn't be copy relocations in kernel * objects. */ printf("kldload: unexpected R_COPY relocation\n"); return -1; break; case R_386_GLOB_DAT: /* S */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; if (*where != addr) *where = addr; break; case R_386_RELATIVE: break; default: printf("kldload: unexpected relocation type %d\n", rtype); return -1; } return(0); } int elf_reloc(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup) { return (elf_reloc_internal(lf, relocbase, data, type, 0, lookup)); } int elf_reloc_local(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup) { return (elf_reloc_internal(lf, relocbase, data, type, 1, lookup)); } int elf_cpu_load_file(linker_file_t lf __unused) { return (0); } int elf_cpu_unload_file(linker_file_t lf __unused) { return (0); } Index: head/sys/i386/i386/exception.s =================================================================== --- head/sys/i386/i386/exception.s (revision 332488) +++ head/sys/i386/i386/exception.s (revision 332489) @@ -1,515 +1,608 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. - * Copyright (c) 2007 The FreeBSD Foundation + * Copyright (c) 2007, 2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. 
+ * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_apic.h" #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" -#include +#include "assym.inc" + #include +#include #include -#include "assym.inc" - -#define SEL_RPL_MASK 0x0003 -#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */ - #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr .align 4 .type dtrace_invop_jump_addr, @object .size dtrace_invop_jump_addr, 4 dtrace_invop_jump_addr: .zero 4 .globl dtrace_invop_calltrap_addr .align 4 .type dtrace_invop_calltrap_addr, @object .size dtrace_invop_calltrap_addr, 4 dtrace_invop_calltrap_addr: .zero 8 #endif .text -#ifdef HWPMC_HOOKS - ENTRY(start_exceptions) -#endif +ENTRY(start_exceptions) + .globl tramp_idleptd +tramp_idleptd: .long 0 + /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ /* * Trap and fault vector routines. * - * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on - * the stack that mostly looks like an interrupt, but does not disable - * interrupts. A few of the traps we are use are interrupt gates, - * SDT_SYS386IGT, which are nearly the same thing except interrupts are - * disabled on entry. + * All traps are 'interrupt gates', SDT_SYS386IGT. Interrupts are disabled + * by hardware so that no further interrupt can be taken until the code has + * switched to the kernel address space and the kernel thread stack. * * The cpu will push a certain amount of state onto the kernel stack for * the current process. The amount of state depends on the type of trap * and whether the trap crossed rings or not. See i386/include/frame.h. * At the very least the current EFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps.
We push a dummy * error code for those traps where the cpu doesn't push one, in order * to maintain a consistent frame. We also push a contrived 'trap number'. * * The cpu does not push the general registers, we must do that, and we * must restore them prior to calling 'iret'. The cpu adjusts the %cs and * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we * must load them with appropriate values for supervisor mode operation. + * + * This code is not executed at the linked address; it is copied to the + * trampoline area. As a consequence, all code there and in included files + * must be PIC. */ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) #define TRAP(a) pushl $(a) ; jmp alltraps IDTVEC(div) pushl $0; TRAP(T_DIVIDE) IDTVEC(dbg) pushl $0; TRAP(T_TRCTRAP) -IDTVEC(nmi) - pushl $0; TRAP(T_NMI) IDTVEC(bpt) pushl $0; TRAP(T_BPTFLT) IDTVEC(dtrace_ret) pushl $0; TRAP(T_DTRACE_RET) IDTVEC(ofl) pushl $0; TRAP(T_OFLOW) IDTVEC(bnd) pushl $0; TRAP(T_BOUND) #ifndef KDTRACE_HOOKS IDTVEC(ill) pushl $0; TRAP(T_PRIVINFLT) #endif IDTVEC(dna) pushl $0; TRAP(T_DNA) IDTVEC(fpusegm) pushl $0; TRAP(T_FPOPFLT) IDTVEC(tss) TRAP(T_TSSFLT) IDTVEC(missing) - TRAP(T_SEGNPFLT) + pushl $T_SEGNPFLT + jmp irettraps IDTVEC(stk) - TRAP(T_STKFLT) + pushl $T_STKFLT + jmp irettraps IDTVEC(prot) - TRAP(T_PROTFLT) + pushl $T_PROTFLT + jmp irettraps IDTVEC(page) - TRAP(T_PAGEFLT) -IDTVEC(mchk) - pushl $0; TRAP(T_MCHK) + cmpl $PMAP_TRM_MIN_ADDRESS, TF_EIP-TF_ERR(%esp) + jb 1f + movl %ebx, %cr3 + movl %edx, TF_EIP-TF_ERR(%esp) + addl $4, %esp + iret +1: pushl $T_PAGEFLT + jmp alltraps IDTVEC(rsvd_pti) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) IDTVEC(fpu) pushl $0; TRAP(T_ARITHTRAP) IDTVEC(align) TRAP(T_ALIGNFLT) IDTVEC(xmm) pushl $0; TRAP(T_XMMFLT) /* - * All traps except ones for syscalls jump to alltraps. If + * All traps except those for syscalls or invalid segments jump + * to alltraps. If * interrupts were enabled when the trap occurred, then interrupts * are enabled now if the trap was through a trap gate, else * disabled if the trap was through an interrupt gate. Note that * int0x80_syscall is a trap gate. Interrupt gates are used by * page faults, non-maskable interrupts, debug and breakpoint * exceptions. */ SUPERALIGN_TEXT .globl alltraps .type alltraps,@function alltraps: - pushal - pushl $0 - movw %ds,(%esp) - pushl $0 - movw %es,(%esp) - pushl $0 - movw %fs,(%esp) + PUSH_FRAME2 alltraps_with_regs_pushed: SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) calltrap: pushl %esp - call trap + movl $trap,%eax + call *%eax add $4, %esp /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti + .globl irettraps + .type irettraps,@function +irettraps: + testl $PSL_VM, TF_EFLAGS-TF_TRAPNO(%esp) + jnz alltraps + testb $SEL_RPL_MASK, TF_CS-TF_TRAPNO(%esp) + jnz alltraps + + /* + * Kernel mode. + * The special case is kernel mode with the user %cr3 and the + * trampoline stack. We need to copy both the current frame and the + * hardware portion of the frame we tried to return to, to the + * normal stack. This logic must follow the stack unwind order + * in doreti.
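+ *
+ * For instance, a fault at doreti_iret means the frame being
+ * returned to was already unwound down to its tf_eip word, so the
+ * copy below moves the current full frame plus that remainder:
+ * TF_SZ + (TF_SZ - TF_EIP) = 2 * TF_SZ - TF_EIP bytes, and
+ * similarly 2 * TF_SZ - TF_DS/TF_ES/TF_FS for faults at the
+ * segment register pops.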
+ */ + PUSH_FRAME2 + SET_KERNEL_SREGS + cld + call 1f +1: popl %ebx + leal (doreti_iret - 1b)(%ebx), %edx + cmpl %edx, TF_EIP(%esp) + jne 2f + movl $(2 * TF_SZ - TF_EIP), %ecx + jmp 6f +2: leal (doreti_popl_ds - 1b)(%ebx), %edx + cmpl %edx, TF_EIP(%esp) + jne 3f + movl $(2 * TF_SZ - TF_DS), %ecx + jmp 6f +3: leal (doreti_popl_es - 1b)(%ebx), %edx + cmpl %edx, TF_EIP(%esp) + jne 4f + movl $(2 * TF_SZ - TF_ES), %ecx + jmp 6f +4: leal (doreti_popl_fs - 1b)(%ebx), %edx + cmpl %edx, TF_EIP(%esp) + jne 5f + movl $(2 * TF_SZ - TF_FS), %ecx + jmp 6f + /* kernel mode, normal */ +5: FAKE_MCOUNT(TF_EIP(%esp)) + jmp calltrap +6: cmpl $PMAP_TRM_MIN_ADDRESS, %esp /* trampoline stack ? */ + jb 5b /* if not, no need to change stacks */ + movl (tramp_idleptd - 1b)(%ebx), %eax + movl %eax, %cr3 + movl PCPU(KESP0), %edx + subl %ecx, %edx + movl %edx, %edi + movl %esp, %esi + rep; movsb + movl %edx, %esp + FAKE_MCOUNT(TF_EIP(%esp)) + jmp calltrap + /* * Privileged instruction fault. */ #ifdef KDTRACE_HOOKS SUPERALIGN_TEXT IDTVEC(ill) /* - * Check if a DTrace hook is registered. The default (data) segment - * cannot be used for this since %ds is not known good until we - * verify that the entry was from kernel mode. - */ - cmpl $0,%ss:dtrace_invop_jump_addr - je norm_ill - - /* * Check if this is a user fault. If so, just handle it as a normal * trap. */ - cmpl $GSEL_KPL, 4(%esp) /* Check the code segment */ - jne norm_ill testl $PSL_VM, 8(%esp) /* and vm86 mode. */ jnz norm_ill + cmpl $GSEL_KPL, 4(%esp) /* Check the code segment */ + jne norm_ill /* + * Check if a DTrace hook is registered. The trampoline cannot + * be instrumented. + */ + cmpl $0, dtrace_invop_jump_addr + je norm_ill + + /* * This is a kernel instruction fault that might have been caused * by a DTrace provider. */ pushal cld /* * Set our jump address for the jump back in the event that * the exception wasn't caused by DTrace at all. */ movl $norm_ill, dtrace_invop_calltrap_addr /* Jump to the code hooked in by DTrace. */ jmpl *dtrace_invop_jump_addr /* * Process the instruction fault in the normal way. */ norm_ill: - pushl $0 - TRAP(T_PRIVINFLT) + pushl $0 + pushl $T_PRIVINFLT + jmp alltraps #endif -/* - * Call gate entry for syscalls (lcall 7,0). - * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables. - * - * The intersegment call has been set up to specify one dummy parameter. - * This leaves a place to put eflags so that the call frame can be - * converted to a trap frame. Note that the eflags is (semi-)bogusly - * pushed into (what will be) tf_err and then copied later into the - * final spot. It has to be done this way because esp can't be just - * temporarily altered for the pushfl - an interrupt might come in - * and clobber the saved cs/eip. - */ - SUPERALIGN_TEXT -IDTVEC(lcall_syscall) - pushfl /* save eflags */ - popl 8(%esp) /* shuffle into tf_eflags */ - pushl $7 /* sizeof "lcall 7,0" */ - pushl $0 /* tf_trapno */ - pushal +IDTVEC(mchk) pushl $0 - movw %ds,(%esp) + pushl $T_MCHK + jmp nmi_mchk_common + +IDTVEC(nmi) pushl $0 - movw %es,(%esp) - pushl $0 - movw %fs,(%esp) + pushl $T_NMI +nmi_mchk_common: + PUSH_FRAME2 SET_KERNEL_SREGS cld + /* + * Save %cr3 into tf_err. There is no good place to put it. + * Always reload %cr3, since we might have interrupted the + * kernel entry or exit. + * Do not switch to the thread kernel stack, otherwise we might + * obliterate the previous context partially copied from the + * trampoline stack. 
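+ *
+ * The matching restore is doreti_iret_nmi, which reloads %cr3 from
+ * tf_err before the final register pops, so an NMI or machine check
+ * taken while the user page table was active returns on that same
+ * page table.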
+ */ + movl %cr3, %eax + movl %eax, TF_ERR(%esp) + call 1f +1: popl %eax + movl (tramp_idleptd - 1b)(%eax), %eax + movl %eax, %cr3 FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti + jmp calltrap /* * Trap gate entry for syscalls (int 0x80). - * This is used by FreeBSD ELF executables, "new" a.out executables, and all + * Linux executables. * * Even though the name says 'int0x80', this is actually a trap gate, not an * interrupt gate. Thus interrupts are enabled on entry just as they are for * a normal syscall. */ SUPERALIGN_TEXT IDTVEC(int0x80_syscall) pushl $2 /* sizeof "int 0x80" */ pushl $0 /* tf_trapno */ - pushal - pushl $0 - movw %ds,(%esp) - pushl $0 - movw %es,(%esp) - pushl $0 - movw %fs,(%esp) + PUSH_FRAME2 SET_KERNEL_SREGS cld + MOVE_STACKS + sti FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call syscall + movl $syscall, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti ENTRY(fork_trampoline) pushl %esp /* trapframe pointer */ pushl %ebx /* arg1 */ pushl %esi /* function */ - call fork_exit + movl $fork_exit, %eax + call *%eax addl $12,%esp /* cut from syscall */ /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti /* * To efficiently implement classification of trap and interrupt handlers * for profiling, there must be only trap handlers between the labels btrap * and bintr, and only interrupt handlers between the labels bintr and * eintr. This is implemented (partly) by including files that contain * some of the handlers. Before including the files, set up a normal asm * environment so that the included files don't need to know that they are * included. */ .data .p2align 4 .text SUPERALIGN_TEXT MCOUNT_LABEL(bintr) #ifdef DEV_ATPIC #include #endif #if defined(DEV_APIC) && defined(DEV_ATPIC) .data .p2align 4 .text SUPERALIGN_TEXT #endif #ifdef DEV_APIC #include #endif .data .p2align 4 .text SUPERALIGN_TEXT #include .text MCOUNT_LABEL(eintr) +#include + /* * void doreti(struct trapframe) * * Handle return from interrupts, traps and syscalls. */ .text SUPERALIGN_TEXT .type doreti,@function .globl doreti doreti: FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ doreti_next: /* * Check if ASTs can be handled now. ASTs cannot be safely * processed when returning from an NMI. */ cmpb $T_NMI,TF_TRAPNO(%esp) #ifdef HWPMC_HOOKS je doreti_nmi #else je doreti_exit #endif /* * PSL_VM must be checked first since segment registers only * have an RPL in non-VM86 mode. * ASTs can not be handled now if we are in a vm86 call. */ testl $PSL_VM,TF_EFLAGS(%esp) jz doreti_notvm86 movl PCPU(CURPCB),%ecx testl $PCB_VM86CALL,PCB_FLAGS(%ecx) jz doreti_ast - jmp doreti_exit + jmp doreti_popl_fs doreti_notvm86: testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ jz doreti_exit /* can't handle ASTs now if not */ doreti_ast: /* * Check for ASTs atomically with returning. Disabling CPU * interrupts provides sufficient locking even in the SMP case, * since we will be informed of any new ASTs by an IPI. */ cli movl PCPU(CURTHREAD),%eax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) je doreti_exit sti pushl %esp /* pass a pointer to the trapframe */ - call ast + movl $ast, %eax + call *%eax add $4,%esp jmp doreti_ast /* * doreti_exit: pop registers, iret. * * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment * registers. The fault is handled in trap.c.
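 *
 * On the way back out to user mode the frame is first copied down to
 * the per-CPU trampoline stack and %cr3 is switched to the user page
 * table taken from PCB_CR3, so the final register pops and the iret
 * run from the trampoline stack with the user mapping already active.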
*/ doreti_exit: MEXITCOUNT + cmpl $T_NMI, TF_TRAPNO(%esp) + je doreti_iret_nmi + cmpl $T_MCHK, TF_TRAPNO(%esp) + je doreti_iret_nmi + testl $SEL_RPL_MASK, TF_CS(%esp) + jz doreti_popl_fs + movl %esp, %esi + movl PCPU(TRAMPSTK), %edx + movl $TF_SZ, %ecx + subl %ecx, %edx + movl %edx, %edi + rep; movsb + movl %edx, %esp + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax), %eax + movl %eax, %cr3 + .globl doreti_popl_fs doreti_popl_fs: popl %fs .globl doreti_popl_es doreti_popl_es: popl %es .globl doreti_popl_ds doreti_popl_ds: popl %ds popal addl $8,%esp .globl doreti_iret doreti_iret: iret +doreti_iret_nmi: + movl TF_ERR(%esp), %eax + movl %eax, %cr3 + jmp doreti_popl_fs + /* * doreti_iret_fault and friends. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (i386/i386/trap.c) catches this specific * case, and continues in the corresponding place in the code * below. * * If the fault occurred during return to usermode, we recreate * the trap frame and call trap() to send a signal. Otherwise * the kernel was tricked into a fault by an attempt to restore invalid * usermode segment selectors on return from nested fault or * interrupt, where the interrupted kernel entry code had not yet loaded * the kernel selectors. In the latter case, emulate iret and zero * the invalid selector. */ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: - subl $8,%esp + pushl $0 /* tf_err */ + pushl $0 /* tf_trapno XXXKIB: provide more useful value ? */ pushal pushl $0 movw %ds,(%esp) .globl doreti_popl_ds_fault doreti_popl_ds_fault: testb $SEL_RPL_MASK,TF_CS-TF_DS(%esp) jz doreti_popl_ds_kfault pushl $0 movw %es,(%esp) .globl doreti_popl_es_fault doreti_popl_es_fault: testb $SEL_RPL_MASK,TF_CS-TF_ES(%esp) jz doreti_popl_es_kfault pushl $0 movw %fs,(%esp) .globl doreti_popl_fs_fault doreti_popl_fs_fault: testb $SEL_RPL_MASK,TF_CS-TF_FS(%esp) jz doreti_popl_fs_kfault - sti movl $0,TF_ERR(%esp) /* XXX should be the error code */ movl $T_PROTFLT,TF_TRAPNO(%esp) - jmp alltraps_with_regs_pushed + SET_KERNEL_SREGS + jmp calltrap doreti_popl_ds_kfault: movl $0,(%esp) jmp doreti_popl_ds doreti_popl_es_kfault: movl $0,(%esp) jmp doreti_popl_es doreti_popl_fs_kfault: movl $0,(%esp) jmp doreti_popl_fs - + #ifdef HWPMC_HOOKS doreti_nmi: /* * Since we are returning from an NMI, check if the current trap * was from user mode and if so whether the current thread * needs a user call chain capture. */ + testl $PSL_VM, TF_EFLAGS(%esp) + jnz doreti_exit testb $SEL_RPL_MASK,TF_CS(%esp) jz doreti_exit movl PCPU(CURTHREAD),%eax /* curthread present? */ orl %eax,%eax jz doreti_exit testl $TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */ jz doreti_exit /* + * Switch to thread stack. Reset tf_trapno to not indicate NMI, + * to cause normal userspace exit. + */ + movl $T_RESERVED, TF_TRAPNO(%esp) + NMOVE_STACKS + /* * Take the processor out of NMI mode by executing a fake "iret". */ pushfl pushl %cs - pushl $outofnmi + call 1f +1: popl %eax + leal (outofnmi-1b)(%eax),%eax + pushl %eax iret outofnmi: /* * Call the callchain capture hook after turning interrupts back on.
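 *
 * The pushes below follow cdecl order, so this amounts to calling
 * pmc_hook(curthread, PMC_FN_USER_CALLCHAIN, frame); the prototype
 * assumed here is the hook pointer declared in sys/sys/pmckern.h
 * (struct thread *, int function, void *arg).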
*/ movl pmc_hook,%ecx orl %ecx,%ecx jz doreti_exit pushl %esp /* frame pointer */ pushl $PMC_FN_USER_CALLCHAIN /* command */ movl PCPU(CURTHREAD),%eax pushl %eax /* curthread */ sti call *%ecx addl $12,%esp jmp doreti_ast - ENTRY(end_exceptions) #endif + +ENTRY(end_exceptions) Index: head/sys/i386/i386/genassym.c =================================================================== --- head/sys/i386/i386/genassym.c (revision 332488) +++ head/sys/i386/i386/genassym.c (revision 332489) @@ -1,233 +1,244 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #endif #include +#include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TD0_KSTACK_PAGES, TD0_KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(NPDEPTD, NPDEPTD); ASSYM(NPGPTD, NPGPTD); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PDESHIFT, PDESHIFT); ASSYM(PTESHIFT, PTESHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDRMASK, PDRMASK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(KERNLOAD, KERNLOAD); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds)); ASSYM(PCB_ES, offsetof(struct pcb, pcb_es)); ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); +ASSYM(PCB_EXT_TSS, offsetof(struct pcb_ext, ext_tss)); + ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_VM86CALL, PCB_VM86CALL); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); ASSYM(TF_DS, offsetof(struct trapframe, 
tf_ds)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); +ASSYM(TF_SZ, sizeof(struct trapframe)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); #endif ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); #ifdef COMPAT_FREEBSD4 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); #endif #ifdef COMPAT_43 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); #endif #ifdef COMPAT_FREEBSD4 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); #endif ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); -ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); +ASSYM(PC_COMMON_TSSP, offsetof(struct pcpu, pc_common_tssp)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); +ASSYM(PC_KESP0, offsetof(struct pcpu, pc_kesp0)); +ASSYM(PC_TRAMPSTK, offsetof(struct pcpu, pc_trampstk)); +ASSYM(PC_COPYOUT_BUF, offsetof(struct pcpu, pc_copyout_buf)); #ifdef DEV_APIC ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); +ASSYM(VM86_STACK_SPACE, VM86_STACK_SPACE); + +ASSYM(PMAP_TRM_MIN_ADDRESS, PMAP_TRM_MIN_ADDRESS); +ASSYM(TRAMP_COPYOUT_SZ, TRAMP_COPYOUT_SZ); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: head/sys/i386/i386/locore.s =================================================================== --- 
head/sys/i386/i386/locore.s (revision 332488) +++ head/sys/i386/i386/locore.s (revision 332489) @@ -1,787 +1,461 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 * $FreeBSD$ * * originally from: locore.s, by William F. Jolitz * * Substantially rewritten by David Greenman, Rod Grimes, * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp * and many others. */ #include "opt_bootp.h" #include "opt_nfsroot.h" #include "opt_pmap.h" #include #include #include #include #include #include #include "assym.inc" /* - * XXX - * - * Note: This version greatly munged to avoid various assembler errors - * that may be fixed in newer versions of gas. Perhaps newer versions - * will have more pleasant appearance. - */ - -/* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). */ .globl PTmap,PTD,PTDpde .set PTmap,(PTDPTDI << PDRSHIFT) .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) .set PTDpde,PTD + (PTDPTDI * PDESIZE) /* - * Compiled KERNBASE location and the kernel load address + * Compiled KERNBASE location and the kernel load address, now identical. 
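+ *
+ * Since the kernel now runs at the address it is linked at, the old
+ * R(foo) ((foo)-KERNBASE) physical-address macro and its users are
+ * dropped from the rest of this file.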
*/ .globl kernbase .set kernbase,KERNBASE .globl kernload .set kernload,KERNLOAD /* * Globals */ .data ALIGN_DATA /* just to be sure */ .space 0x2000 /* space for tmpstk - temporary stack */ tmpstk: .globl bootinfo bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ - .globl KERNend -KERNend: .long 0 /* phys addr end of kernel (just after bss) */ -physfree: .long 0 /* phys addr of next free page */ - - .globl IdlePTD -IdlePTD: .long 0 /* phys addr of kernel PTD */ - -#if defined(PAE) || defined(PAE_TABLES) - .globl IdlePDPT -IdlePDPT: .long 0 /* phys addr of kernel PDPT */ -#endif - - .globl KPTmap -KPTmap: .long 0 /* address of kernel page tables */ - - .globl KPTphys -KPTphys: .long 0 /* phys addr of kernel page tables */ - - .globl proc0kstack -proc0kstack: .long 0 /* address of proc 0 kstack space */ -p0kpa: .long 0 /* phys addr of proc0's STACK */ - -vm86phystk: .long 0 /* PA of vm86/bios stack */ - - .globl vm86paddr, vm86pa -vm86paddr: .long 0 /* address of vm86 region */ -vm86pa: .long 0 /* phys addr of vm86 region */ - -/********************************************************************** - * - * Some handy macros - * - */ - -#define R(foo) ((foo)-KERNBASE) - -#define ALLOCPAGES(foo) \ - movl R(physfree), %esi ; \ - movl $((foo)*PAGE_SIZE), %eax ; \ - addl %esi, %eax ; \ - movl %eax, R(physfree) ; \ - movl %esi, %edi ; \ - movl $((foo)*PAGE_SIZE),%ecx ; \ - xorl %eax,%eax ; \ - cld ; \ - rep ; \ - stosb - -/* - * fillkpt - * eax = page frame address - * ebx = index into page table - * ecx = how many pages to map - * base = base address of page dir/table - * prot = protection bits - */ -#define fillkpt(base, prot) \ - shll $PTESHIFT,%ebx ; \ - addl base,%ebx ; \ - orl $PG_V,%eax ; \ - orl prot,%eax ; \ -1: movl %eax,(%ebx) ; \ - addl $PAGE_SIZE,%eax ; /* increment physical address */ \ - addl $PTESIZE,%ebx ; /* next pte */ \ - loop 1b - -/* - * fillkptphys(prot) - * eax = physical address - * ecx = how many pages to map - * prot = protection bits - */ -#define fillkptphys(prot) \ - movl %eax, %ebx ; \ - shrl $PAGE_SHIFT, %ebx ; \ - fillkpt(R(KPTphys), prot) - .text /********************************************************************** * * This is where the bootblocks start us, set the ball rolling... * */ NON_GPROF_ENTRY(btext) /* Tell the bios to warmboot next time */ movw $0x1234,0x472 /* Set up a real frame in case the double return in newboot is executed. */ + xorl %ebp,%ebp pushl %ebp movl %esp, %ebp /* Don't trust what the BIOS gives for eflags. */ pushl $PSL_KERNEL popfl /* * Don't trust what the BIOS gives for %fs and %gs. Trust the bootstrap * to set %cs, %ds, %es and %ss. */ mov %ds, %ax mov %ax, %fs mov %ax, %gs /* * Clear the bss. Not all boot programs do it, and it is our job anyway. * * XXX we don't check that there is memory for our bss and page tables * before using it. * * Note: we must be careful to not overwrite an active gdt or idt. They are * inactive from now until we switch to new ones, since we don't load any * more segment registers or permit interrupts until after the switch. */ - movl $R(end),%ecx - movl $R(edata),%edi + movl $end,%ecx + movl $edata,%edi subl %edi,%ecx xorl %eax,%eax cld rep stosb call recover_bootinfo /* Get onto a stack that we can trust. */ /* * XXX this step is delayed in case recover_bootinfo needs to return via * the old stack, but it need not be, since recover_bootinfo actually * returns via the old frame.
*/ - movl $R(tmpstk),%esp + movl $tmpstk,%esp call identify_cpu - call create_pagetables + call pmap_cold -/* - * If the CPU has support for VME, turn it on. - */ - testl $CPUID_VME, R(cpu_feature) - jz 1f - movl %cr4, %eax - orl $CR4_VME, %eax - movl %eax, %cr4 -1: - -/* Now enable paging */ -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax - movl %eax, %cr3 - movl %cr4, %edx - orl $CR4_PAE, %edx - movl %edx, %cr4 -#else - movl R(IdlePTD), %eax - movl %eax,%cr3 /* load ptd addr into mmu */ -#endif - movl %cr0,%edx /* get control word */ - orl $CR0_PE|CR0_PG,%edx /* enable paging */ - movl %edx,%cr0 /* and let's page NOW! */ - - pushl $begin /* jump to high virtualized address */ - ret - -begin: - /* - * Now running relocated at KERNBASE where the system is linked to run. - * - * Remove the lowest part of the double mapping of low memory to get - * some null pointer checks. - */ - movl $0,PTD - movl %eax,%cr3 /* invalidate TLB */ - /* set up bootstrap stack */ movl proc0kstack,%eax /* location of in-kernel stack */ /* * Only use bottom page for init386(). init386() calculates the * PCB + FPU save area size and returns the true top of stack. */ leal PAGE_SIZE(%eax),%esp xorl %ebp,%ebp /* mark end of frames */ pushl physfree /* value of first for init386(first) */ call init386 /* wire 386 chip for unix operation */ /* * Clean up the stack in a way that db_numargs() understands, so * that backtraces in ddb don't underrun the stack. Traps for * inaccessible memory are more fatal than usual this early. */ addl $4,%esp /* Switch to true top of stack. */ movl %eax,%esp call mi_startup /* autoconfiguration, mountroot etc */ /* NOTREACHED */ addl $0,%esp /* for db_numargs() again */ /********************************************************************** * * Recover the bootinfo passed to us from the boot program * */ recover_bootinfo: /* * This code is called in different ways depending on what loaded * and started the kernel. This is used to detect how we get the * arguments from the other code and what we do with them. * * Old disk boot blocks: * (*btext)(howto, bootdev, cyloffset, esym); * [return address == 0, and can NOT be returned to] * [cyloffset was not supported by the FreeBSD boot code * and always passed in as 0] * [esym is also known as total in the boot code, and * was never properly supported by the FreeBSD boot code] * * Old diskless netboot code: * (*btext)(0,0,0,0,&nfsdiskless,0,0,0); * [return address != 0, and can NOT be returned to] * If we are being booted by this code it will NOT work, * so we are just going to halt if we find this case. * * New uniform boot code: * (*btext)(howto, bootdev, 0, 0, 0, &bootinfo) * [return address != 0, and can be returned to] * * There may seem to be a lot of wasted arguments in here, but * that is so the newer boot code can still load very old kernels * and old boot code can load new kernels. */ /* * The old style disk boot blocks fake a frame on the stack and * did an lret to get here. The frame on the stack has a return * address of 0. */ cmpl $0,4(%ebp) je olddiskboot /* * We have some form of return address, so this is either the * old diskless netboot code, or the new uniform code. That can * be detected by looking at the 5th argument, if it is 0 * we are being booted by the new uniform boot code. */ cmpl $0,24(%ebp) je newboot /* * Seems we have been loaded by the old diskless boot code, we * don't stand a chance of running as the diskless structure * changed considerably between the two, so just halt. 
*/ hlt /* * We have been loaded by the new uniform boot code. * Let's check the bootinfo version, and if we do not understand * it we return to the loader with a status of 1 to indicate this error */ newboot: movl 28(%ebp),%ebx /* &bootinfo.version */ movl BI_VERSION(%ebx),%eax cmpl $1,%eax /* We only understand version 1 */ je 1f movl $1,%eax /* Return status */ leave /* * XXX this returns to our caller's caller (as is required) since * we didn't set up a frame and our caller did. */ ret 1: /* * If we have a kernelname copy it in */ movl BI_KERNELNAME(%ebx),%esi cmpl $0,%esi je 2f /* No kernelname */ movl $MAXPATHLEN,%ecx /* Brute force!!! */ - movl $R(kernelname),%edi + movl $kernelname,%edi cmpb $'/',(%esi) /* Make sure it starts with a slash */ je 1f movb $'/',(%edi) incl %edi decl %ecx 1: cld rep movsb 2: /* * Determine the size of the boot loader's copy of the bootinfo * struct. This is impossible to do properly because old versions * of the struct don't contain a size field and there are 2 old * versions with the same version number. */ movl $BI_ENDCOMMON,%ecx /* prepare for sizeless version */ testl $RB_BOOTINFO,8(%ebp) /* bi_size (and bootinfo) valid? */ je got_bi_size /* no, sizeless version */ movl BI_SIZE(%ebx),%ecx got_bi_size: /* * Copy the common part of the bootinfo struct */ movl %ebx,%esi - movl $R(bootinfo),%edi + movl $bootinfo,%edi cmpl $BOOTINFO_SIZE,%ecx jbe got_common_bi_size movl $BOOTINFO_SIZE,%ecx got_common_bi_size: cld rep movsb #ifdef NFS_ROOT #ifndef BOOTP_NFSV3 /* * If we have a nfs_diskless structure copy it in */ movl BI_NFS_DISKLESS(%ebx),%esi cmpl $0,%esi je olddiskboot - movl $R(nfs_diskless),%edi + movl $nfs_diskless,%edi movl $NFSDISKLESS_SIZE,%ecx cld rep movsb - movl $R(nfs_diskless_valid),%edi + movl $nfs_diskless_valid,%edi movl $1,(%edi) #endif #endif /* * The old style disk boot. * (*btext)(howto, bootdev, cyloffset, esym); * Note that the newer boot code just falls into here to pick * up howto and bootdev, cyloffset and esym are no longer used */ olddiskboot: movl 8(%ebp),%eax - movl %eax,R(boothowto) + movl %eax,boothowto movl 12(%ebp),%eax - movl %eax,R(bootdev) + movl %eax,bootdev ret /********************************************************************** * * Identify the CPU and initialize anything special about it * */ identify_cpu: /* Try to toggle alignment check flag; does not exist on 386. */ pushfl popl %eax movl %eax,%ecx orl $PSL_AC,%eax pushl %eax popfl pushfl popl %eax xorl %ecx,%eax andl $PSL_AC,%eax pushl %ecx popfl testl %eax,%eax jnz try486 /* NexGen CPU does not have aligment check flag. */ pushfl movl $0x5555, %eax xorl %edx, %edx movl $2, %ecx clc divl %ecx jz trynexgen popfl - movl $CPU_386,R(cpu) + movl $CPU_386,cpu jmp 3f trynexgen: popfl - movl $CPU_NX586,R(cpu) - movl $0x4778654e,R(cpu_vendor) # store vendor string - movl $0x72446e65,R(cpu_vendor+4) - movl $0x6e657669,R(cpu_vendor+8) - movl $0,R(cpu_vendor+12) + movl $CPU_NX586,cpu + movl $0x4778654e,cpu_vendor # store vendor string + movl $0x72446e65,cpu_vendor+4 + movl $0x6e657669,cpu_vendor+8 + movl $0,cpu_vendor+12 jmp 3f try486: /* Try to toggle identification flag; does not exist on early 486s. */ pushfl popl %eax movl %eax,%ecx xorl $PSL_ID,%eax pushl %eax popfl pushfl popl %eax xorl %ecx,%eax andl $PSL_ID,%eax pushl %ecx popfl testl %eax,%eax jnz trycpuid - movl $CPU_486,R(cpu) + movl $CPU_486,cpu /* * Check Cyrix CPU * Cyrix CPUs do not change the undefined flags following * execution of the divide instruction which divides 5 by 2. 
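Both of identify_cpu's eflags probes above follow the same pattern: try to flip a PSL bit and see whether the change sticks. A C sketch, assuming the read_eflags()/write_eflags() accessors from <machine/cpufunc.h> (eflags_bit_toggles is an invented name):

	/*
	 * PSL_AC cannot be set on a 386; PSL_ID cannot be toggled on CPUs
	 * that lack the cpuid instruction.
	 */
	static int
	eflags_bit_toggles(u_int bit)
	{
		u_int ef0, ef1;

		ef0 = read_eflags();
		write_eflags(ef0 ^ bit);	/* attempt to flip the bit */
		ef1 = read_eflags();
		write_eflags(ef0);		/* restore the original */
		return (((ef0 ^ ef1) & bit) != 0);
	}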
* * Note: CPUID is enabled on M2, so it passes another way. */ pushfl movl $0x5555, %eax xorl %edx, %edx movl $2, %ecx clc divl %ecx jnc trycyrix popfl jmp 3f /* You may use Intel CPU. */ trycyrix: popfl /* * IBM Bluelighting CPU also doesn't change the undefined flags. * Because IBM doesn't disclose the information for Bluelighting * CPU, we couldn't distinguish it from Cyrix's (including IBM * brand of Cyrix CPUs). */ - movl $0x69727943,R(cpu_vendor) # store vendor string - movl $0x736e4978,R(cpu_vendor+4) - movl $0x64616574,R(cpu_vendor+8) + movl $0x69727943,cpu_vendor # store vendor string + movl $0x736e4978,cpu_vendor+4 + movl $0x64616574,cpu_vendor+8 jmp 3f trycpuid: /* Use the `cpuid' instruction. */ xorl %eax,%eax cpuid # cpuid 0 - movl %eax,R(cpu_high) # highest capability - movl %ebx,R(cpu_vendor) # store vendor string - movl %edx,R(cpu_vendor+4) - movl %ecx,R(cpu_vendor+8) - movb $0,R(cpu_vendor+12) + movl %eax,cpu_high # highest capability + movl %ebx,cpu_vendor # store vendor string + movl %edx,cpu_vendor+4 + movl %ecx,cpu_vendor+8 + movb $0,cpu_vendor+12 movl $1,%eax cpuid # cpuid 1 - movl %eax,R(cpu_id) # store cpu_id - movl %ebx,R(cpu_procinfo) # store cpu_procinfo - movl %edx,R(cpu_feature) # store cpu_feature - movl %ecx,R(cpu_feature2) # store cpu_feature2 + movl %eax,cpu_id # store cpu_id + movl %ebx,cpu_procinfo # store cpu_procinfo + movl %edx,cpu_feature # store cpu_feature + movl %ecx,cpu_feature2 # store cpu_feature2 rorl $8,%eax # extract family type andl $15,%eax cmpl $5,%eax jae 1f /* less than Pentium; must be 486 */ - movl $CPU_486,R(cpu) + movl $CPU_486,cpu jmp 3f 1: /* a Pentium? */ cmpl $5,%eax jne 2f - movl $CPU_586,R(cpu) + movl $CPU_586,cpu jmp 3f 2: /* Greater than Pentium...call it a Pentium Pro */ - movl $CPU_686,R(cpu) + movl $CPU_686,cpu 3: - ret - - -/********************************************************************** - * - * Create the first page directory and its page tables. - * - */ - -create_pagetables: - -/* Find end of kernel image (rounded up to a page boundary). */ - movl $R(_end),%esi - -/* Include symbols, if any. */ - movl R(bootinfo+BI_ESYMTAB),%edi - testl %edi,%edi - je over_symalloc - movl %edi,%esi - movl $KERNBASE,%edi - addl %edi,R(bootinfo+BI_SYMTAB) - addl %edi,R(bootinfo+BI_ESYMTAB) -over_symalloc: - -/* If we are told where the end of the kernel space is, believe it. */ - movl R(bootinfo+BI_KERNEND),%edi - testl %edi,%edi - je no_kernend - movl %edi,%esi -no_kernend: - - addl $PDRMASK,%esi /* Play conservative for now, and */ - andl $~PDRMASK,%esi /* ... round up to PDR boundary */ - movl %esi,R(KERNend) /* save end of kernel */ - movl %esi,R(physfree) /* next free page is at end of kernel */ - -/* Allocate Kernel Page Tables */ - ALLOCPAGES(NKPT) - movl %esi,R(KPTphys) - addl $(KERNBASE-(KPTDI<<(PDRSHIFT-PAGE_SHIFT+PTESHIFT))),%esi - movl %esi,R(KPTmap) - -/* Allocate Page Table Directory */ -#if defined(PAE) || defined(PAE_TABLES) - /* XXX only need 32 bytes (easier for now) */ - ALLOCPAGES(1) - movl %esi,R(IdlePDPT) -#endif - ALLOCPAGES(NPGPTD) - movl %esi,R(IdlePTD) - -/* Allocate KSTACK */ - ALLOCPAGES(TD0_KSTACK_PAGES) - movl %esi,R(p0kpa) - addl $KERNBASE, %esi - movl %esi, R(proc0kstack) - - ALLOCPAGES(1) /* vm86/bios stack */ - movl %esi,R(vm86phystk) - - ALLOCPAGES(3) /* pgtable + ext + IOPAGES */ - movl %esi,R(vm86pa) - addl $KERNBASE, %esi - movl %esi, R(vm86paddr) - -/* - * Enable PSE and PGE. 
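What the trycpuid path stores, restated with <machine/cpufunc.h>'s do_cpuid() (a sketch; assumes the usual kernel headers for memcpy()):

	u_int family, regs[4];
	char vendor[13];

	do_cpuid(0, regs);			/* cpuid leaf 0 */
	memcpy(vendor + 0, &regs[1], 4);	/* %ebx: "Genu" */
	memcpy(vendor + 4, &regs[3], 4);	/* %edx: "ineI" */
	memcpy(vendor + 8, &regs[2], 4);	/* %ecx: "ntel" */
	vendor[12] = '\0';

	do_cpuid(1, regs);			/* cpuid leaf 1 */
	family = (regs[0] >> 8) & 0xf;		/* the "rorl $8; andl $15" */
	/* family < 5: 486-class, == 5: CPU_586, > 5: CPU_686 */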
- */ -#ifndef DISABLE_PSE - testl $CPUID_PSE, R(cpu_feature) - jz 1f - movl $PG_PS, R(pseflag) - movl %cr4, %eax - orl $CR4_PSE, %eax - movl %eax, %cr4 -1: -#endif -#ifndef DISABLE_PG_G - testl $CPUID_PGE, R(cpu_feature) - jz 2f - movl $PG_G, R(pgeflag) - movl %cr4, %eax - orl $CR4_PGE, %eax - movl %eax, %cr4 -2: -#endif - -/* - * Initialize page table pages mapping physical address zero through the - * (physical) end of the kernel. Many of these pages must be reserved, - * and we reserve them all and map them linearly for convenience. We do - * this even if we've enabled PSE above; we'll just switch the corresponding - * kernel PDEs before we turn on paging. - * - * XXX: We waste some pages here in the PSE case! - * - * This and all other page table entries allow read and write access for - * various reasons. Kernel mappings never have any access restrictions. - */ - xorl %eax, %eax - movl R(KERNend),%ecx - shrl $PAGE_SHIFT,%ecx - fillkptphys($PG_RW) - -/* Map page table pages. */ - movl R(KPTphys),%eax - movl $NKPT,%ecx - fillkptphys($PG_RW) - -/* Map page directory. */ -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax - movl $1, %ecx - fillkptphys($PG_RW) -#endif - - movl R(IdlePTD), %eax - movl $NPGPTD, %ecx - fillkptphys($PG_RW) - -/* Map proc0's KSTACK in the physical way ... */ - movl R(p0kpa), %eax - movl $(TD0_KSTACK_PAGES), %ecx - fillkptphys($PG_RW) - -/* Map ISA hole */ - movl $ISA_HOLE_START, %eax - movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx - fillkptphys($PG_RW) - -/* Map space for the vm86 region */ - movl R(vm86phystk), %eax - movl $4, %ecx - fillkptphys($PG_RW) - -/* Map page 0 into the vm86 page table */ - movl $0, %eax - movl $0, %ebx - movl $1, %ecx - fillkpt(R(vm86pa), $PG_RW|PG_U) - -/* ...likewise for the ISA hole */ - movl $ISA_HOLE_START, %eax - movl $ISA_HOLE_START>>PAGE_SHIFT, %ebx - movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx - fillkpt(R(vm86pa), $PG_RW|PG_U) - -/* - * Create an identity mapping for low physical memory, including the kernel. - * This is only used to map the 2 instructions for jumping to 'begin' in - * locore (we map everything to avoid having to determine where these - * instructions are). ACPI resume will transiently restore the first PDE in - * this mapping (and depend on this PDE's page table created here not being - * destroyed). See pmap_bootstrap() for more details. - * - * Note: There are errata concerning large pages and physical address zero, - * so a PG_PS mapping should not be used for PDE 0. Our double mapping - * avoids this automatically by not using PG_PS for PDE #KPDI so that PAT - * bits can be set at the page level for i/o pages below 1 MB. - */ - movl R(KPTphys), %eax - xorl %ebx, %ebx - movl $NKPT, %ecx - fillkpt(R(IdlePTD), $PG_RW) - -/* - * Install PDEs for PTs covering enough kva to bootstrap. Then for the PSE - * case, replace the PDEs whose coverage is strictly within the kernel - * (between KERNLOAD (rounded up) and KERNend) by large-page PDEs. 
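The large-page replacement implemented by the deleted hunk that follows can be read as this C loop (a sketch; map_kernel_pde_large is an invented name, NBPDR is the 4 MB span of one PDE, and roundup() is from <sys/param.h>):

	/* Remap KERNLOAD..KERNend with 4 MB PG_PS page-directory entries. */
	static void
	map_kernel_pde_large(pd_entry_t *ptd, vm_paddr_t kernend)
	{
		vm_paddr_t pa;

		for (pa = roundup(KERNLOAD, NBPDR); pa < kernend; pa += NBPDR)
			ptd[KPTDI + (pa >> PDRSHIFT)] =
			    pa | PG_V | PG_RW | PG_PS;
	}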
- */ - movl R(KPTphys), %eax - movl $KPTDI, %ebx - movl $NKPT, %ecx - fillkpt(R(IdlePTD), $PG_RW) - cmpl $0,R(pseflag) - je done_pde - - movl R(KERNend), %ecx - movl $(KERNLOAD + PDRMASK) & ~PDRMASK, %eax - subl %eax, %ecx - shrl $PDRSHIFT, %ecx - movl $KPTDI + ((KERNLOAD + PDRMASK) >> PDRSHIFT), %ebx - shll $PDESHIFT, %ebx - addl R(IdlePTD), %ebx - orl $(PG_V|PG_RW|PG_PS), %eax -1: movl %eax, (%ebx) - addl $(1 << PDRSHIFT), %eax - addl $PDESIZE, %ebx - loop 1b - -done_pde: -/* install a pde recursively mapping page directory as a page table */ - movl R(IdlePTD), %eax - movl $PTDPTDI, %ebx - movl $NPGPTD,%ecx - fillkpt(R(IdlePTD), $PG_RW) - -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePTD), %eax - xorl %ebx, %ebx - movl $NPGPTD, %ecx - fillkpt(R(IdlePDPT), $0x0) -#endif - ret #ifdef XENHVM /* Xen Hypercall page */ .text .p2align PAGE_SHIFT, 0x90 /* Hypercall_page needs to be PAGE aligned */ NON_GPROF_ENTRY(hypercall_page) .skip 0x1000, 0x90 /* Fill with "nop"s */ #endif Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 332488) +++ head/sys/i386/i386/machdep.c (revision 332489) @@ -1,3044 +1,3247 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * + * Copyright (c) 2018 The FreeBSD Foundation * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#ifdef SMP #include -#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); -extern register_t init386(int first); -extern void dblfault_handler(void); +register_t init386(int first); +void dblfault_handler(void); static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; - /* Default init_ops implementation. */ - struct init_ops init_ops = { +extern char start_exceptions[], end_exceptions[]; + +extern struct sysentvec elf32_freebsd_sysvec; + +/* Default init_ops implementation. */ +struct init_ops init_ops = { .early_clock_source_init = i8254_init, .early_delay = i8254_delay, #ifdef DEV_APIC .msi_init = msi_init, #endif - }; +}; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. 
* We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. 
*/ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; if (p->p_sysent->sv_sigcode_base != 0) { regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szosigcode; } else { /* a.out sysentvec does not use shared page */ regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; } regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. 
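The eflags splice in the vm86 branches above always has the same shape: keep the hardware frame's bits except the virtual-interrupt pair, which comes from the software copy. Restated:

	/* Without VME, VIF/VIP are emulated in vm86_eflags; merge them into
	 * the frame's eflags so the handler sees the emulated state. */
	eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
	    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));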
*/ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned int)sp & ~0x3F); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. 
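The stack carving in sendsig() above is driven by two alignment rules: the XSAVE extension area must be 64-byte aligned, and the sigframe itself is kept 16-byte aligned. In outline:

	if (xfpusave != NULL) {
		sp -= xfpusave_len;			 /* extended FPU state */
		sp = (char *)((unsigned int)sp & ~0x3F); /* 64-byte align */
	}
	sp -= sizeof(struct sigframe);
	sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* 16-byte align */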
* We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base; if (regs->tf_eip == 0) regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
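A sketch of the two guards used by all the sigreturn flavors (their definitions live earlier in this file; in essence): user code may only change the PSL_USERCHANGE subset of eflags, and may only load a ring-3 %cs.

	#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)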
*/ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) { uprintf( "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } +#ifdef COMPAT_43 +static void +setup_priv_lcall_gate(struct proc *p) +{ + struct i386_ldt_args uap; + union descriptor desc; + u_int lcall_addr; + + bzero(&uap, sizeof(uap)); + uap.start = 0; + uap.num = 1; + lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp; + bzero(&desc, sizeof(desc)); + desc.sd.sd_type = SDT_MEMERA; + desc.sd.sd_dpl = SEL_UPL; + desc.sd.sd_p = 1; + desc.sd.sd_def32 = 1; + desc.sd.sd_gran = 1; + desc.sd.sd_lolimit = 0xffff; + desc.sd.sd_hilimit = 0xf; + desc.sd.sd_lobase = lcall_addr; + desc.sd.sd_hibase = lcall_addr >> 24; + i386_set_ldt(curthread, &uap, &desc); +} +#endif + /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_eflags; regs = td->td_frame; pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); - if (td->td_proc->p_md.md_ldt) + if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); + +#ifdef COMPAT_43 + if (td->td_proc->p_sysent->sv_psstrings != + elf32_freebsd_sysvec.sv_psstrings) + setup_priv_lcall_gate(td->td_proc); +#endif /* * Reset the fs and gs bases. The values from the old address * space do not make sense for the new program. In particular, * gsbase might be the TLS base for the old program but the new * program has no TLS now. */ set_fsbase(td, 0); set_gsbase(td, 0); /* Make sure edx is 0x0 on entry. Linux binaries depend on it. */ saved_eflags = regs->tf_eflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | saved_eflags; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. 
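To read the descriptor that setup_priv_lcall_gate() builds above: the 32-bit base is split 24/8 across sd_lobase/sd_hibase, and because sd_gran is set the 20-bit limit counts 4 KB pages. A decode sketch:

	u_int base, limit, bytes;

	base = desc.sd.sd_hibase << 24 | desc.sd.sd_lobase;	/* lcall_addr */
	limit = desc.sd.sd_hilimit << 16 | desc.sd.sd_lolimit;	/* 0xfffff */
	bytes = (limit << PAGE_SHIFT) | PAGE_MASK;	/* 0xffffffff: 4 GB */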
*/ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); static char bootmethod[16] = "BIOS"; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; -union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ -union descriptor ldt[NLDT]; /* local descriptor table */ +struct mtx dt_lock; /* lock for GDT and LDT */ + +union descriptor gdt0[NGDT]; /* initial global descriptor table */ +union descriptor *gdt = gdt0; /* global descriptor table */ + +union descriptor *ldt; /* local descriptor table */ + static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ -struct region_descriptor r_gdt, r_idt; /* table descriptors */ -struct mtx dt_lock; /* lock for GDT and LDT */ -static struct i386tss dblfault_tss; -static char dblfault_stack[PAGE_SIZE]; +static struct i386tss *dblfault_tss; +static char *dblfault_stack; -extern vm_offset_t proc0kstack; +static struct i386tss common_tss0; +vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. 
* * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ -{ .ssd_base = (int) ldt, - .ssd_limit = sizeof(ldt)-1, +{ .ssd_base = 0, + .ssd_limit = sizeof(union descriptor) * NLDT - 1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ -{ .ssd_base = (int) ldt, +{ .ssd_base = 0, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ -{ .ssd_base = (int) &dblfault_tss, +{ .ssd_base = 0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, 
.ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; +uintptr_t setidt_disp; + void -setidt(idx, func, typ, dpl, selec) - int idx; - inthand_t *func; - int typ; - int dpl; - int selec; +setidt(int idx, inthand_t *func, int typ, int dpl, int selec) { + uintptr_t off; + + off = func != NULL ? (uintptr_t)func + setidt_disp : 0; + setidt_nodisp(idx, off, typ, dpl, selec); +} + +void +setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) +{ struct gate_descriptor *ip; ip = idt + idx; - ip->gd_looffset = (int)func; + ip->gd_looffset = off; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; - ip->gd_hioffset = ((int)func)>>16 ; + ip->gd_hioffset = ((u_int)off) >> 16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif - IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. 
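Typical setidt() use, as in init386() later in this file: the handler symbol is passed as linked, and setidt() folds setidt_disp in before setidt_nodisp() writes the gate.

	/* Install the breakpoint gate; the displacement makes it point at
	 * the handler's trampoline alias. */
	setidt(IDT_BPT, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));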
*/ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; - uintptr_t func; + uintptr_t func, func_trm; + bool trm; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { - func = (ip->gd_hioffset << 16 | ip->gd_looffset); - if (func != (uintptr_t)&IDTVEC(rsvd)) { - db_printf("%3d\t", idx); - db_printsym(func, DB_STGY_PROC); - db_printf("\n"); + if (ip->gd_type == SDT_SYSTASKGT) { + db_printf("%3d\t\n", idx); + } else { + func = (ip->gd_hioffset << 16 | ip->gd_looffset); + if (func >= PMAP_TRM_MIN_ADDRESS) { + func_trm = func; + func -= setidt_disp; + trm = true; + } else + trm = false; + if (func != (uintptr_t)&IDTVEC(rsvd)) { + db_printf("%3d\t", idx); + db_printsym(func, DB_STGY_PROC); + if (trm) + db_printf(" (trampoline %#x)", + func_trm); + db_printf("\n"); + } } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016llx\n", rxcr(0)); if (amd_feature & (AMDID_NX | AMDID_LM)) db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t0x%016llx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); if ((cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); if (cpu_feature & CPUID_PAT) db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%08x\n", rdr0()); db_printf("dr1\t0x%08x\n", rdr1()); db_printf("dr2\t0x%08x\n", rdr2()); db_printf("dr3\t0x%08x\n", rdr3()); db_printf("dr6\t0x%08x\n", rdr6()); db_printf("dr7\t0x%08x\n", rdr7()); } + +DB_SHOW_COMMAND(frame, db_show_frame) +{ + struct trapframe *frame; + + frame = have_addr ? (struct trapframe *)addr : curthread->td_frame; + printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n", + frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs, + frame->tf_eip); + printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno); + printf("ds %#x es %#x fs %#x\n", + frame->tf_ds, frame->tf_es, frame->tf_fs); + printf("eax %#x ecx %#x edx %#x ebx %#x\n", + frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx); + printf("ebp %#x esi %#x edi %#x\n", + frame->tf_ebp, frame->tf_esi, frame->tf_edi); + +} #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); #ifndef PAE if (base > 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(length / 1024)); return (1); } #endif /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. 
*/ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); return (add_physmap_entry(smap->base, smap->length, physmap, physmap_idxp)); } static void add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, int *physmap_idxp) { struct bios_smap *smap, *smapend; u_int32_t smapsize; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } static void basemem_setup(void) { - vm_paddr_t pa; pt_entry_t *pte; int i; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* - * XXX if biosbasemem is now < 640, there is a `hole' - * between the end of base memory and the start of - * ISA memory. The hole may be empty or it may - * contain BIOS code or data. Map it read/write so - * that the BIOS can write to it. (Memory from 0 to - * the physical end of the kernel is mapped read-only - * to begin with and then parts of it are remapped. - * The parts that aren't remapped form holes that - * remain read-only and are unused by the kernel. - * The base memory area is below the physical end of - * the kernel and right now forms a read-only hole. - * The part of it from PAGE_SIZE to - * (trunc_page(biosbasemem * 1024) - 1) will be - * remapped and used by the kernel later.) - * - * This code is similar to the code used in - * pmap_mapdev, but since no memory needs to be - * allocated we simply change the mapping. - */ - for (pa = trunc_page(basemem * 1024); - pa < ISA_HOLE_START; pa += PAGE_SIZE) - pmap_kenter(KERNBASE + pa, pa); - - /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? 
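A worked example of add_physmap_entry()'s coalescing, with hypothetical sizes (physmap starts zeroed and physmap_idx at 0, as getmemsize() arranges):

	vm_paddr_t physmap[PHYSMAP_SIZE + 2] = { 0 };
	int idx = 0;

	add_physmap_entry(0x0, 0x9f000, physmap, &idx);
	/* extends the zeroed first pair: { 0x0, 0x9f000 }, idx still 0 */
	add_physmap_entry(0x100000, 0x7ff00000, physmap, &idx);
	/* disjoint, so a new pair: { 0x0, 0x9f000, 0x100000, 0x80000000 },
	   idx now 2 */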
 */
        pte = (pt_entry_t *)vm86paddr;
        for (i = basemem / 4; i < 160; i++)
                pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(int first)
{
        int has_smap, off, physmap_idx, pa_indx, da_indx;
        u_long memtest;
        vm_paddr_t physmap[PHYSMAP_SIZE];
        pt_entry_t *pte;
        quad_t dcons_addr, dcons_size, physmem_tunable;
        int hasbrokenint12, i, res;
        u_int extmem;
        struct vm86frame vmf;
        struct vm86context vmc;
        vm_paddr_t pa;
        struct bios_smap *smap, *smapbase;
        caddr_t kmdp;

        has_smap = 0;
        bzero(&vmf, sizeof(vmf));
        bzero(physmap, sizeof(physmap));
        basemem = 0;

        /*
         * Check if the loader supplied an SMAP memory map. If so,
         * use that and do not make any VM86 calls.
         */
        physmap_idx = 0;
        kmdp = preload_search_by_type("elf kernel");
        if (kmdp == NULL)
                kmdp = preload_search_by_type("elf32 kernel");
        smapbase = (struct bios_smap *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_SMAP);
        if (smapbase != NULL) {
                add_smap_entries(smapbase, physmap, &physmap_idx);
                has_smap = 1;
                goto have_smap;
        }

        /*
         * Some newer BIOSes have a broken INT 12H implementation
         * which causes a kernel panic immediately. In this case, we
         * need to use the SMAP to determine the base memory size.
         */
        hasbrokenint12 = 0;
        TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
        if (hasbrokenint12 == 0) {
                /* Use INT12 to determine base memory size. */
                vm86_intcall(0x12, &vmf);
                basemem = vmf.vmf_ax;
                basemem_setup();
        }

        /*
         * Fetch the memory map with INT 15:E820. Map page 1 R/W into
         * the kernel page table so we can use it as a buffer. The
         * kernel will unmap this page later.
         */
-       pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
        vmc.npages = 0;
-       smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
+       smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1));
        res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
        KASSERT(res != 0, ("vm86_getptr() failed: address not found"));

        vmf.vmf_ebx = 0;
        do {
                vmf.vmf_eax = 0xE820;
                vmf.vmf_edx = SMAP_SIG;
                vmf.vmf_ecx = sizeof(struct bios_smap);
                i = vm86_datacall(0x15, &vmf, &vmc);
                if (i || vmf.vmf_eax != SMAP_SIG)
                        break;
                has_smap = 1;
                if (!add_smap_entry(smap, physmap, &physmap_idx))
                        break;
        } while (vmf.vmf_ebx != 0);

have_smap:
        /*
         * If we didn't fetch the "base memory" size from INT12,
         * figure it out from the SMAP (or just guess).
         */
        if (basemem == 0) {
                for (i = 0; i <= physmap_idx; i += 2) {
                        if (physmap[i] == 0x00000000) {
                                basemem = physmap[i + 1] / 1024;
                                break;
                        }
                }

                /* XXX: If we couldn't find basemem from SMAP, just guess. */
                if (basemem == 0)
                        basemem = 640;
                basemem_setup();
        }

        if (physmap[1] != 0)
                goto physmap_done;

        /*
         * If we failed to find an SMAP, figure out the extended
         * memory size. We will then build a simple memory map with
         * two segments, one for "base memory" and the second for
         * "extended memory". Note that "extended memory" starts at a
         * physical address of 1MB and that both basemem and extmem
         * are in units of 1KB.
         *
         * First, try to fetch the extended memory size via INT 15:E801.
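
The have_smap fallback above derives basemem by scanning the assembled map for the segment that starts at physical address zero and converting its length to KiB, guessing 640K when no such segment exists. In isolation (map layout as in the previous sketch; values are made up):

#include <stdint.h>
#include <stdio.h>

/* Length in KiB of the physical segment starting at 0, or the 640K guess. */
static uint32_t
basemem_from_map(const uint64_t *map, int idx)
{
        int i;

        for (i = 0; i <= idx; i += 2)
                if (map[i] == 0)
                        return ((uint32_t)(map[i + 1] / 1024));
        return (640);
}

int
main(void)
{
        uint64_t map[4] = { 0x00000000, 0x0009f000, 0x00100000, 0x40000000 };

        printf("basemem = %uK\n", basemem_from_map(map, 2));    /* 636K */
        return (0);
}
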
*/ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ alloc_ap_trampoline(physmap, &physmap_idx); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. * * This is especially confusing when it is much larger than the * memory size and is displayed as "realmem". */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * By default enable the memory test on real hardware, and disable * it if we appear to be running in a VM. This avoids touching all * pages unnecessarily, which doesn't matter on real hardware but is * bad for shared VM hosts. Use a general name so that * one could eventually do more with the code than just disable it. */ memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP3; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. 
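
The Maxmem adjustments above are page-frame arithmetic: atop() turns a byte address into a page count and ptoa() reverses it, and the hw.physmem tunable may shrink but, with an SMAP present, never grow the detected range. A compact model of that clamp (PAGE_SHIFT of 12 assumed; the macro spellings mirror, but are not, the kernel's):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define atop(x) ((uint64_t)(x) >> PAGE_SHIFT)
#define ptoa(x) ((uint64_t)(x) << PAGE_SHIFT)

int
main(void)
{
        uint64_t map_end = 0x80000000ull;       /* end of the last segment */
        uint64_t tunable = 0x60000000ull;       /* pretend hw.physmem value */
        uint64_t maxmem;

        maxmem = atop(map_end);         /* one past the highest page frame */
        if (tunable != 0)
                maxmem = atop(tunable); /* operator override... */
        if (maxmem > atop(map_end))     /* ...but never past detected memory */
                maxmem = atop(map_end);
        printf("Maxmem = %llu pages (%lluM)\n", (unsigned long long)maxmem,
            (unsigned long long)(ptoa(maxmem) >> 20));
        return (0);
}
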
 */
        for (i = 0; i <= physmap_idx; i += 2) {
                vm_paddr_t end;

                end = ptoa((vm_paddr_t)Maxmem);
                if (physmap[i + 1] < end)
                        end = trunc_page(physmap[i + 1]);
                for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
                        int tmp, page_bad, full;
                        int *ptr = (int *)CADDR3;

                        full = FALSE;
                        /*
                         * block out kernel memory as not available.
                         */
                        if (pa >= KERNLOAD && pa < first)
                                goto do_dump_avail;

                        /*
                         * block out dcons buffer
                         */
                        if (dcons_addr > 0 &&
                            pa >= trunc_page(dcons_addr) &&
                            pa < dcons_addr + dcons_size)
                                goto do_dump_avail;

                        page_bad = FALSE;
                        if (memtest == 0)
                                goto skip_memtest;

                        /*
                         * map page into kernel: valid, read/write, non-cacheable
                         */
                        *pte = pa | PG_V | PG_RW | PG_N;
                        invltlb();

                        tmp = *(int *)ptr;
                        /*
                         * Test for alternating 1's and 0's
                         */
                        *(volatile int *)ptr = 0xaaaaaaaa;
                        if (*(volatile int *)ptr != 0xaaaaaaaa)
                                page_bad = TRUE;
                        /*
                         * Test for alternating 0's and 1's
                         */
                        *(volatile int *)ptr = 0x55555555;
                        if (*(volatile int *)ptr != 0x55555555)
                                page_bad = TRUE;
                        /*
                         * Test for all 1's
                         */
                        *(volatile int *)ptr = 0xffffffff;
                        if (*(volatile int *)ptr != 0xffffffff)
                                page_bad = TRUE;
                        /*
                         * Test for all 0's
                         */
                        *(volatile int *)ptr = 0x0;
                        if (*(volatile int *)ptr != 0x0)
                                page_bad = TRUE;
                        /*
                         * Restore original value.
                         */
                        *(int *)ptr = tmp;

skip_memtest:
                        /*
                         * Adjust array of valid/good pages.
                         */
                        if (page_bad == TRUE)
                                continue;
                        /*
                         * If this good page is a continuation of the
                         * previous set of good pages, then just increase
                         * the end pointer. Otherwise start a new chunk.
                         * Note that "end" points one higher than end,
                         * making the range >= start and < end.
                         * If we're also doing a speculative memory
                         * test and we are at or past the end, bump up Maxmem
                         * so that we keep going. The first bad page
                         * will terminate the loop.
                         */
                        if (phys_avail[pa_indx] == pa) {
                                phys_avail[pa_indx] += PAGE_SIZE;
                        } else {
                                pa_indx++;
                                if (pa_indx == PHYS_AVAIL_ARRAY_END) {
                                        printf(
                "Too many holes in the physical address space, giving up\n");
                                        pa_indx--;
                                        full = TRUE;
                                        goto do_dump_avail;
                                }
                                phys_avail[pa_indx++] = pa;     /* start */
                                phys_avail[pa_indx] = pa + PAGE_SIZE;   /* end */
                        }
                        physmem++;
do_dump_avail:
                        if (dump_avail[da_indx] == pa) {
                                dump_avail[da_indx] += PAGE_SIZE;
                        } else {
                                da_indx++;
                                if (da_indx == DUMP_AVAIL_ARRAY_END) {
                                        da_indx--;
                                        goto do_next;
                                }
                                dump_avail[da_indx++] = pa;     /* start */
                                dump_avail[da_indx] = pa + PAGE_SIZE;   /* end */
                        }
do_next:
                        if (full)
                                break;
                }
        }
        *pte = 0;
        invltlb();

        /*
         * XXX
         * The last chunk must contain at least one page plus the message
         * buffer to avoid complicating other code (message buffer address
         * calculation, etc.).
         */
        while (phys_avail[pa_indx - 1] + PAGE_SIZE +
            round_page(msgbufsize) >= phys_avail[pa_indx]) {
                physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
                phys_avail[pa_indx--] = 0;
                phys_avail[pa_indx--] = 0;
        }

        Maxmem = atop(phys_avail[pa_indx]);

        /* Trim off space for the message buffer. */
        phys_avail[pa_indx] -= round_page(msgbufsize);

        /* Map the message buffer.
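
The page test above writes alternating and solid bit patterns through a volatile pointer and re-reads each one; a page is rejected if any pattern fails to stick, and the original word is restored afterwards. The same sequence applied to a single word of ordinary memory (the kernel runs this against an uncached mapping of every candidate physical page):

#include <stdint.h>
#include <stdio.h>

/* Run the four memtest patterns against one word; 1 means the word is bad. */
static int
word_tests_bad(volatile uint32_t *p)
{
        static const uint32_t patterns[] = {
                0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000
        };
        uint32_t saved;
        unsigned i;
        int bad;

        saved = *p;
        bad = 0;
        for (i = 0; i < sizeof(patterns) / sizeof(patterns[0]); i++) {
                *p = patterns[i];               /* write the pattern... */
                if (*p != patterns[i])          /* ...and it must read back */
                        bad = 1;
        }
        *p = saved;                             /* restore original contents */
        return (bad);
}

int
main(void)
{
        uint32_t word = 0xdeadbeef;

        printf("bad=%d word=%#x\n", word_tests_bad(&word), word);
        return (0);
}
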
*/ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } static void i386_kdb_init(void) { #ifdef DDB db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } +static void +fixup_idt(void) +{ + struct gate_descriptor *ip; + uintptr_t off; + int x; + + for (x = 0; x < NIDT; x++) { + ip = &idt[x]; + if (ip->gd_type != SDT_SYS386IGT && + ip->gd_type != SDT_SYS386TGT) + continue; + off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16); + KASSERT(off >= (uintptr_t)start_exceptions && + off < (uintptr_t)end_exceptions, + ("IDT[%d] type %d off %#x", x, ip->gd_type, off)); + off += setidt_disp; + MPASS(off >= PMAP_TRM_MIN_ADDRESS && + off < PMAP_TRM_MAX_ADDRESS); + ip->gd_looffset = off; + ip->gd_hioffset = off >> 16; + } +} + +static void +i386_setidt1(void) +{ + int x; + + /* exceptions */ + for (x = 0; x < NIDT; x++) + setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, + SEL_KPL)); + setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), + SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef KDTRACE_HOOKS + setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), + SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif +#ifdef XENHVM + setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif +} + +static void +i386_setidt2(void) +{ + + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); +} + +#if defined(DEV_ISA) && !defined(DEV_ATPIC) +static void +i386_setidt3(void) +{ + + setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), + SDT_SYS386IGT, SEL_KPL, 
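
fixup_idt() above rebases every interrupt and trap gate by setidt_disp once the exception entry code has been copied into the trampoline region, asserting each offset lay inside start_exceptions/end_exceptions first. The rebase logic over a plain array of offsets (ranges and displacement are invented for the example):

#include <assert.h>
#include <stdint.h>

#define NVEC 4

/* Rebase each handler offset by disp, asserting it starts inside [lo, hi). */
static void
rebase_vectors(uint32_t *vec, int n, uint32_t disp, uint32_t lo, uint32_t hi)
{
        int i;

        for (i = 0; i < n; i++) {
                assert(vec[i] >= lo && vec[i] < hi);
                vec[i] += disp;
        }
}

int
main(void)
{
        uint32_t vec[NVEC] = { 0x00c01000, 0x00c01040, 0x00c01080, 0x00c010c0 };

        /* Pretend the handler text moved up by 0xff000000. */
        rebase_vectors(vec, NVEC, 0xff000000, 0x00c01000, 0x00c02000);
        assert(vec[0] == 0xffc01000 && vec[3] == 0xffc010c0);
        return (0);
}
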
GSEL(GCODE_SEL, SEL_KPL)); +} +#endif + register_t init386(int first) { - struct gate_descriptor *gdp; + struct region_descriptor r_gdt, r_idt; /* table descriptors */ int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; struct xstate_hdr *xhdr; + vm_offset_t addend; int late_console; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = TD0_KSTACK_PAGES; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); - metadata_missing = 0; if (bootinfo.bi_modulep) { - preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; - preload_bootstrap_relocate(KERNBASE); + metadata_missing = 0; + addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? + PMAP_MAP_LOW : 0; + preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; + preload_bootstrap_relocate(addend); } else { metadata_missing = 1; } - if (bootinfo.bi_envp != 0) - init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); - else + if (bootinfo.bi_envp != 0) { + addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? + PMAP_MAP_LOW : 0; + init_static_kenv((char *)bootinfo.bi_envp + addend, 0); + } else { init_static_kenv(NULL, 0); + } identify_hypervisor(); /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + gdt_segs[GPRIV_SEL].ssd_base = (int)pc; + gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; for (x = 0; x < NGDT; x++) - ssdtosd(&gdt_segs[x], &gdt[x].sd); + ssdtosd(&gdt_segs[x], &gdt0[x].sd); - r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; - r_gdt.rd_base = (int) gdt; + r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1; + r_gdt.rd_base = (int)gdt0; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) - pmap_kenter(pa + KERNBASE, pa); - dpcpu_init((void *)(first + KERNBASE), 0); + pmap_kenter(pa, pa); + dpcpu_init((void *)first, 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. 
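
The init386() hunks above pick a per-pointer addend for loader-supplied data: a pointer below KERNBASE is physical and must be reached through the PMAP_MAP_LOW alias, while an already-virtual pointer is used unchanged. A sketch of that selection with placeholder constants (the real values come from the kernel's pmap layout):

#include <stdint.h>
#include <stdio.h>

#define KERNBASE_SK     0xc0000000u     /* placeholder, not the real constant */
#define PMAP_MAP_LOW_SK 0x00800000u     /* placeholder low-alias base */

/* Translate a loader pointer: physical ones go through the low alias. */
static uintptr_t
loader_ptr_to_va(uint32_t p)
{
        uint32_t addend;

        addend = p < KERNBASE_SK ? PMAP_MAP_LOW_SK : 0;
        return ((uintptr_t)(p + addend));
}

int
main(void)
{
        printf("%#lx\n", (unsigned long)loader_ptr_to_va(0x00100000u));
        printf("%#lx\n", (unsigned long)loader_ptr_to_va(0xc0400000u));
        return (0);
}
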
*/ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); - /* make ldt memory segments */ - ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); - ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); - for (x = 0; x < nitems(ldt_segs); x++) - ssdtosd(&ldt_segs[x], &ldt[x].sd); + i386_setidt1(); - _default_ldt = GSEL(GLDT_SEL, SEL_KPL); - lldt(_default_ldt); - PCPU_SET(currentldt, _default_ldt); - - /* exceptions */ - for (x = 0; x < NIDT; x++) - setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL - , GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); - setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); -#ifdef KDTRACE_HOOKS - setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif -#ifdef XENHVM - setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif - r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). 
*/ clock_init(); finishidentcpu(); /* Final stage of CPU initialization */ - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + i386_setidt2(); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = - dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; - dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = - dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); -#if defined(PAE) || defined(PAE_TABLES) - dblfault_tss.tss_cr3 = (int)IdlePDPT; -#else - dblfault_tss.tss_cr3 = (int)IdlePTD; -#endif - dblfault_tss.tss_eip = (int)dblfault_handler; - dblfault_tss.tss_eflags = PSL_KERNEL; - dblfault_tss.tss_ds = dblfault_tss.tss_es = - dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); - dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); - dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); - dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); - /* Initialize the tss (except for the final esp0) early for vm86. */ - PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + - thread0.td_kstack_pages * PAGE_SIZE - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * + PAGE_SIZE - VM86_STACK_SPACE; + common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ - setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + i386_setidt3(); #endif #endif /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); i386_kdb_init(); } vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); if (late_console) i386_kdb_init(); msgbufinit(msgbufp, msgbufsize); npxinit(true); /* * Set up thread0 pcb after npxinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. 
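
The esp0 settings above place the ring-0 stack pointer at the top of the thread's kernel stack minus a fixed reserve, VM86_STACK_SPACE, replacing the old hard-coded 16 so a trapframe can be extended when the trap arrived from vm86 mode. The computation on its own (sizes are placeholders):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_SK    4096u
#define VM86_RESERVE    16u     /* placeholder for VM86_STACK_SPACE */

/* Top of the kernel stack, leaving room to extend a vm86 trapframe. */
static uintptr_t
kstack_esp0(uintptr_t kstack, unsigned pages)
{
        return (kstack + pages * PAGE_SIZE_SK - VM86_RESERVE);
}

int
main(void)
{
        printf("%#lx\n", (unsigned long)kstack_esp0(0x01000000, 4));
        return (0);
}
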
*/ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); + common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; + PCPU_SET(kesp0, common_tss0.tss_esp0); gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); - /* make a call gate to reenter kernel with */ - gdp = &ldt[LSYS5CALLS_SEL].gd; - - x = (int) &IDTVEC(lcall_syscall); - gdp->gd_looffset = x; - gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); - gdp->gd_stkcpy = 1; - gdp->gd_type = SDT_SYS386CGT; - gdp->gd_dpl = SEL_UPL; - gdp->gd_p = 1; - gdp->gd_hioffset = x >> 16; - /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #if defined(PAE) || defined(PAE_TABLES) thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } +extern u_int tramp_idleptd; + +static void +machdep_init_trampoline(void) +{ + struct region_descriptor r_gdt, r_idt; + struct i386tss *tss; + char *copyout_buf, *trampoline, *tramp_stack_base; + u_int *tramp_idleptd_reloced; + int x; + + gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, + M_NOWAIT | M_ZERO); + bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); + r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; + r_gdt.rd_base = (int)gdt; + lgdt(&r_gdt); + + tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, + M_NOWAIT | M_ZERO); + bcopy(&common_tss0, tss, sizeof(struct i386tss)); + gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; + gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; + gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + PCPU_SET(common_tssp, tss); + + trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, + M_NOWAIT); + bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); + tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); + PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ - + VM86_STACK_SPACE); + tss[0].tss_esp0 = PCPU_GET(trampstk); + + idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); + bcopy(idt0, idt, sizeof(idt0)); + + /* Re-initialize new IDT since the handlers were relocated */ + setidt_disp = trampoline - start_exceptions; + fixup_idt(); + + tramp_idleptd_reloced = (u_int *)((uintptr_t)&tramp_idleptd + + setidt_disp); +#if defined(PAE) || defined(PAE_TABLES) + *tramp_idleptd_reloced = (u_int)IdlePDPT; +#else + *tramp_idleptd_reloced = (u_int)IdlePTD; +#endif + + r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; + r_idt.rd_base = (int)idt; + lidt(&r_idt); + + /* dblfault TSS */ + dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); + dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); + dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = + dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = + (int)dblfault_stack + PAGE_SIZE; + dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = + dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); +#if defined(PAE) || defined(PAE_TABLES) + dblfault_tss->tss_cr3 = (int)IdlePDPT; +#else + dblfault_tss->tss_cr3 = (int)IdlePTD; +#endif + 
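
machdep_init_trampoline() above copies the exception entry code into newly allocated trampoline memory and records setidt_disp as the distance between the copy and the original text, the same displacement fixup_idt() adds to every gate. The copy-and-displacement idea in miniature, with malloc() standing in for pmap_trm_alloc() and opaque bytes for the handler code:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        /* Opaque bytes standing in for start_exceptions..end_exceptions. */
        static const unsigned char handlers[64] = { 0x90, 0x90, 0xc3 };
        unsigned char *copy;
        uintptr_t disp;

        copy = malloc(sizeof(handlers));        /* stand-in for pmap_trm_alloc */
        assert(copy != NULL);
        memcpy(copy, handlers, sizeof(handlers));

        /* The sketch's "setidt_disp": distance from original to copy. */
        disp = (uintptr_t)copy - (uintptr_t)handlers;

        /* Any address into the original maps to the copy by adding disp. */
        assert((uintptr_t)&handlers[2] + disp == (uintptr_t)&copy[2]);
        free(copy);
        return (0);
}
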
dblfault_tss->tss_eip = (int)dblfault_handler; + dblfault_tss->tss_eflags = PSL_KERNEL; + dblfault_tss->tss_ds = dblfault_tss->tss_es = + dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; + gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; + + /* make ldt memory segments */ + ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, + M_NOWAIT | M_ZERO); + gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; + gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; + ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); + ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); + for (x = 0; x < nitems(ldt_segs); x++) + ssdtosd(&ldt_segs[x], &ldt[x].sd); + + _default_ldt = GSEL(GLDT_SEL, SEL_KPL); + lldt(_default_ldt); + PCPU_SET(currentldt, _default_ldt); + + copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); + PCPU_SET(copyout_buf, copyout_buf); + copyout_init_tramp(); +} +SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); + +#ifdef COMPAT_43 +static void +i386_setup_lcall_gate(void) +{ + struct sysentvec *sv; + struct user_segment_descriptor desc; + u_int lcall_addr; + + sv = &elf32_freebsd_sysvec; + lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; + + bzero(&desc, sizeof(desc)); + desc.sd_type = SDT_MEMERA; + desc.sd_dpl = SEL_UPL; + desc.sd_p = 1; + desc.sd_def32 = 1; + desc.sd_gran = 1; + desc.sd_lolimit = 0xffff; + desc.sd_hilimit = 0xf; + desc.sd_lobase = lcall_addr; + desc.sd_hibase = lcall_addr >> 24; + bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); +} +SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); +#endif + void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. 
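
i386_setup_lcall_gate() above no longer installs a call gate; it builds an ordinary executable descriptor whose 32-bit base is split 24/8 across sd_lobase/sd_hibase and whose 20-bit limit, with granularity set, spans the full 4GB. The packing arithmetic alone (a plain bitfield struct, not the kernel's segment_descriptor):

#include <assert.h>
#include <stdint.h>

/* Illustrative descriptor fields only; not the kernel's layout. */
struct seg {
        uint32_t lobase : 24;   /* base bits 0..23 */
        uint32_t hibase : 8;    /* base bits 24..31 */
        uint32_t lolimit : 16;  /* limit bits 0..15 */
        uint32_t hilimit : 4;   /* limit bits 16..19 */
        uint32_t gran : 1;      /* limit counted in 4K pages when set */
};

int
main(void)
{
        struct seg s = { 0 };
        uint32_t base = 0xbfbfe000;     /* pretend lcall trampoline address */

        s.lobase = base & 0xffffff;
        s.hibase = base >> 24;
        s.lolimit = 0xffff;
        s.hilimit = 0xf;
        s.gran = 1;

        /* The split base reassembles, and 0xfffff pages + 1 cover 4GB. */
        assert(((uint32_t)s.hibase << 24 | s.lobase) == base);
        assert((((uint64_t)s.hilimit << 16 | s.lolimit) + 1) * 4096 ==
            0x100000000ull);
        return (0);
}
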
 */
        kmdp = preload_search_by_type("elf kernel");
        if (kmdp == NULL)
                kmdp = preload_search_by_type("elf32 kernel");
        smapbase = (struct bios_smap *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_SMAP);
        if (smapbase == NULL)
                return (0);
        smapattr = (uint32_t *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
        count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
        error = 0;
        for (i = 0; i < count; i++) {
                smap.base = smapbase[i].base;
                smap.length = smapbase[i].length;
                smap.type = smapbase[i].type;
                if (smapattr != NULL)
                        smap.xattr = smapattr[i];
                else
                        smap.xattr = 0;
                error = SYSCTL_OUT(req, &smap, sizeof(smap));
        }
        return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

void
spinlock_enter(void)
{
        struct thread *td;
        register_t flags;

        td = curthread;
        if (td->td_md.md_spinlock_count == 0) {
                flags = intr_disable();
                td->td_md.md_spinlock_count = 1;
                td->td_md.md_saved_flags = flags;
        } else
                td->td_md.md_spinlock_count++;
        critical_enter();
}

void
spinlock_exit(void)
{
        struct thread *td;
        register_t flags;

        td = curthread;
        critical_exit();
        flags = td->td_md.md_saved_flags;
        td->td_md.md_spinlock_count--;
        if (td->td_md.md_spinlock_count == 0)
                intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

static void
f00f_hack(void *unused)
{
+       struct region_descriptor r_idt;
        struct gate_descriptor *new_idt;
        vm_offset_t tmp;

        if (!has_f00f_bug)
                return;

        GIANT_REQUIRED;

        printf("Intel Pentium detected, installing workaround for F00F bug\n");

-       tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
+       tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO);
        if (tmp == 0)
                panic("pmap_trm_alloc returned 0");
+       tmp = round_page(tmp);

        /* Put the problematic entry (#6) at the end of the lower page. */
-       new_idt = (struct gate_descriptor*)
+       new_idt = (struct gate_descriptor *)
            (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
        bcopy(idt, new_idt, sizeof(idt0));
        r_idt.rd_base = (u_int)new_idt;
+       r_idt.rd_limit = sizeof(idt0) - 1;
        lidt(&r_idt);
+       /* SMP machines do not need the F00F hack. */
        idt = new_idt;
        pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

        pcb->pcb_edi = tf->tf_edi;
        pcb->pcb_esi = tf->tf_esi;
        pcb->pcb_ebp = tf->tf_ebp;
        pcb->pcb_ebx = tf->tf_ebx;
        pcb->pcb_eip = tf->tf_eip;
        pcb->pcb_esp = (ISPL(tf->tf_cs)) ?
tf->tf_esp : (int)(tf + 1) - 8; pcb->pcb_gs = rgs(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_gs = pcb->pcb_gs; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); npxgetregs(td); if (cpu_fxsr) npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, (struct save87 *)fpregs); else bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, sizeof(*fpregs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { if (cpu_fxsr) npx_set_fpregs_xmm((struct save87 *)fpregs, &get_pcb_user_save_td(td)->sv_xmm); else bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, sizeof(*fpregs)); npxuserinited(td); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; struct segment_descriptor *sdp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. 
* * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tp; char *xfpustate; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = npxgetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(union savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } static void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) npxdrop(); /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. 
*/ curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
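
set_dbregs() above rejects the undefined DR7 encodings by extracting each breakpoint's two-bit access type and length fields before accepting user-supplied debug registers. A sketch of those extractions and the 0x02 checks (the macros follow the architectural DR7 layout, mirroring but not reusing the kernel's DBREG_* definitions):

#include <assert.h>
#include <stdint.h>

/*
 * R/Wn and LENn occupy DR7 bits 16+4n and 18+4n, two bits each; the
 * per-breakpoint L/G enable bits are the low byte, two per slot.
 */
#define DR7_ACCESS(d, n)  (((d) >> (16 + (n) * 4)) & 3)
#define DR7_LEN(d, n)     (((d) >> (18 + (n) * 4)) & 3)
#define DR7_ENABLED(d, n) ((((d) >> ((n) * 2)) & 3) != 0)

/* Accept a DR7 image only if no breakpoint uses an undefined encoding. */
static int
dr7_valid(uint32_t dr7)
{
        int i;

        for (i = 0; i < 4; i++) {
                if (DR7_ACCESS(dr7, i) == 0x02)   /* reserved access type */
                        return (0);
                if (DR7_LEN(dr7, i) == 0x02)      /* undefined length */
                        return (0);
        }
        return (1);
}

int
main(void)
{
        assert(dr7_valid(0x00000000));
        assert(DR7_ENABLED(0x00000001, 0));       /* L0 set */
        assert(!dr7_valid(0x2u << 16));           /* bp0 access type 0b10 */
        return (0);
}
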
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: head/sys/i386/i386/mem.c =================================================================== --- head/sys/i386/i386/mem.c (revision 332488) +++ head/sys/i386/i386/mem.c (revision 332489) @@ -1,236 +1,233 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1986, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and code derived from software contributed to * Berkeley by William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: mem.c 1.13 89/10/08$ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91 */ #include __FBSDID("$FreeBSD$"); /* * Memory special file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Used in /dev/mem drivers and elsewhere */ MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors"); static struct sx memsxlock; SX_SYSINIT(memsxlockinit, &memsxlock, "/dev/mem lock"); /* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { int o; u_int c = 0; vm_paddr_t pa; struct iovec *iov; int error = 0; vm_offset_t addr; if (dev2unit(dev) != CDEV_MINOR_MEM && dev2unit(dev) != CDEV_MINOR_KMEM) return EIO; if (dev2unit(dev) == CDEV_MINOR_KMEM && uio->uio_resid > 0) { - if (uio->uio_offset < (vm_offset_t)VADDR(PTDPTDI, 0)) - return (EFAULT); - if (!kernacc((caddr_t)(int)uio->uio_offset, uio->uio_resid, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) return (EFAULT); } while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } if (dev2unit(dev) == CDEV_MINOR_MEM) { if (uio->uio_offset > cpu_getmaxphyaddr()) { error = EFAULT; break; } pa = trunc_page(uio->uio_offset); } else { /* * Extract the physical page since the mapping may * change at any time. This avoids panics on page * fault in this case but will cause reading/writing * to the wrong page. * Hopefully an application will notice the wrong * data on read access and refrain from writing. * This should be replaced by a special uiomove * type function that just returns an error if there * is a page fault on a kernel page. */ addr = trunc_page(uio->uio_offset); pa = pmap_extract(kernel_pmap, addr); if (pa == 0) return EFAULT; } /* * XXX UPS This should just use sf_buf_alloc. * Unfortunately sf_buf_alloc needs a vm_page * and we may want to look at memory not covered * by the page array. */ sx_xlock(&memsxlock); pmap_kenter((vm_offset_t)ptvmmap, pa); pmap_invalidate_page(kernel_pmap,(vm_offset_t)ptvmmap); o = (int)uio->uio_offset & PAGE_MASK; c = PAGE_SIZE - o; c = min(c, (u_int)iov->iov_len); error = uiomove((caddr_t)&ptvmmap[o], (int)c, uio); pmap_qremove((vm_offset_t)ptvmmap, 1); sx_xunlock(&memsxlock); } return (error); } /* * allow user processes to MMAP some memory sections * instead of going through read/write */ /* ARGSUSED */ int memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int prot __unused, vm_memattr_t *memattr __unused) { if (dev2unit(dev) == CDEV_MINOR_MEM) { if (offset > cpu_getmaxphyaddr()) return (-1); *paddr = offset; return (0); } return (-1); } /* * Operations for changing memory attributes. * * This is basically just an ioctl shim for mem_range_attr_get * and mem_range_attr_set. 
*/ /* ARGSUSED */ int memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags, struct thread *td) { int nd, error = 0; struct mem_range_op *mo = (struct mem_range_op *)data; struct mem_range_desc *md; /* is this for us? */ if ((cmd != MEMRANGE_GET) && (cmd != MEMRANGE_SET)) return (ENOTTY); /* any chance we can handle this? */ if (mem_range_softc.mr_op == NULL) return (EOPNOTSUPP); /* do we have any descriptors? */ if (mem_range_softc.mr_ndesc == 0) return (ENXIO); switch (cmd) { case MEMRANGE_GET: nd = imin(mo->mo_arg[0], mem_range_softc.mr_ndesc); if (nd > 0) { md = (struct mem_range_desc *) malloc(nd * sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = mem_range_attr_get(md, &nd); if (!error) error = copyout(md, mo->mo_desc, nd * sizeof(struct mem_range_desc)); free(md, M_MEMDESC); } else nd = mem_range_softc.mr_ndesc; mo->mo_arg[0] = nd; break; case MEMRANGE_SET: md = (struct mem_range_desc *)malloc(sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = copyin(mo->mo_desc, md, sizeof(struct mem_range_desc)); /* clamp description string */ md->mr_owner[sizeof(md->mr_owner) - 1] = 0; if (error == 0) error = mem_range_attr_set(md, &mo->mo_arg[0]); free(md, M_MEMDESC); break; } return (error); } Index: head/sys/i386/i386/minidump_machdep.c =================================================================== --- head/sys/i386/i386/minidump_machdep.c (revision 332488) +++ head/sys/i386/i386/minidump_machdep.c (revision 332489) @@ -1,377 +1,377 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); #define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK) #define DEV_ALIGN(x) roundup2((off_t)(x), DEV_BSIZE) uint32_t *vm_page_dump; int vm_page_dump_size; static struct kerneldumpheader kdh; /* Handle chunked writes. 
*/ static size_t fragsz; static void *dump_va; static uint64_t counter, progress; CTASSERT(sizeof(*vm_page_dump) == 4); static int is_dumpable(vm_paddr_t pa) { int i; for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } #define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8) static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) { printf("address not page aligned\n"); return (EINVAL); } if (ptr != NULL) { /* If we're doing a virtual dump, flush any pre-existing pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 24) { printf(" %lld", PG2MB(progress >> PAGE_SHIFT)); counter &= (1<<24) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A fake page table page, to avoid having to handle both 4K and 2M pages */ static pt_entry_t fakept[NPTEPG]; int minidumpsys(struct dumperinfo *di) { uint64_t dumpsize; uint32_t ptesize; vm_offset_t va; int error; uint32_t bits; uint64_t pa; pd_entry_t *pd; pt_entry_t *pt; int i, j, k, bit; struct minidumphdr mdhdr; counter = 0; /* Walk page table pages, set bits in vm_page_dump */ ptesize = 0; for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) { /* * We always write a page, even if it is zero. Each * page written corresponds to 2MB of space */ ptesize += PAGE_SIZE; - pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */ + pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (is_dumpable(pa)) dump_add_page(pa); pa += PAGE_SIZE; } continue; } if ((pd[j] & PG_V) == PG_V) { /* set bit for each valid page in this 2MB block */ pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0); for (k = 0; k < NPTEPG; k++) { if ((pt[k] & PG_V) == PG_V) { pa = pt[k] & PG_FRAME; if (is_dumpable(pa)) dump_add_page(pa); } } } else { /* nothing, we're going to dump a null page */ } } /* Calculate dump size. 
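
blk_write() above never pushes more than maxdumpsz bytes per I/O and carries a partial fragment (fragsz) across calls so scattered pages can share a device write. The chunking loop against an in-memory stand-in for the dump device (no pmap mappings, watchdog pats, or abort checks):

#include <stdio.h>
#include <string.h>

#define MAXIO 16                        /* the sketch's maxdumpsz */

static char device[256];                /* stand-in for the dump device */
static size_t devoff, fragsz;
static char frag[MAXIO];

/* Flush the accumulated fragment to the "device". */
static void
blk_flush_sk(void)
{
        memcpy(device + devoff, frag, fragsz);
        devoff += fragsz;
        fragsz = 0;
}

/* Buffer sz bytes, issuing only full MAXIO-sized writes. */
static void
blk_write_sk(const char *p, size_t sz)
{
        size_t len;

        while (sz > 0) {
                len = MAXIO - fragsz;   /* room left in the fragment */
                if (len > sz)
                        len = sz;
                memcpy(frag + fragsz, p, len);
                fragsz += len;
                p += len;
                sz -= len;
                if (fragsz == MAXIO)
                        blk_flush_sk();
        }
}

int
main(void)
{
        blk_write_sk("0123456789abcdefghij", 20);  /* one flush plus a tail */
        blk_flush_sk();                 /* push out the partial fragment */
        printf("%zu bytes: %.20s\n", devoff, device);
        return (0);
}
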
*/ dumpsize = ptesize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(vm_page_dump_size); for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = bsfl(bits); pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) { dumpsize += PAGE_SIZE; } else { dump_drop_page(pa); } bits &= ~(1ul << bit); } } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = vm_page_dump_size; mdhdr.ptesize = ptesize; mdhdr.kernbase = KERNBASE; #if defined(PAE) || defined(PAE_TABLES) mdhdr.paemode = 1; #endif dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_I386_VERSION, dumpsize); printf("Physical memory: %ju MB\n", ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); error = dump_start(di, &kdh); if (error != 0) goto fail; /* Dump my header */ bzero(&fakept, sizeof(fakept)); bcopy(&mdhdr, &fakept, sizeof(mdhdr)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size)); if (error) goto fail; /* Dump kernel page table pages */ for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) { /* We always write a page, even if it is zero */ - pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */ + pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is a single 2M block. Generate a fake PTP */ pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M; } error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; continue; } if ((pd[j] & PG_V) == PG_V) { pa = pd[j] & PG_FRAME; error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } else { bzero(fakept, sizeof(fakept)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; } } /* Dump memory chunks */ /* XXX cluster it up and use blk_dump() */ for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = bsfl(bits); pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; bits &= ~(1ul << bit); } } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) printf("\nDump failed. 
Partition too small.\n"); else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } void dump_add_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 5; /* 2^5 = 32 */ bit = pa & 31; atomic_set_int(&vm_page_dump[idx], 1ul << bit); } void dump_drop_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 5; /* 2^5 = 32 */ bit = pa & 31; atomic_clear_int(&vm_page_dump[idx], 1ul << bit); } Index: head/sys/i386/i386/mp_machdep.c =================================================================== --- head/sys/i386/i386/mp_machdep.c (revision 332488) +++ head/sys/i386/i386/mp_machdep.c (revision 332489) @@ -1,455 +1,464 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 -#define WARMBOOT_OFF (KERNBASE + 0x0467) -#define WARMBOOT_SEG (KERNBASE + 0x0469) +#define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467) +#define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. 
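
dump_add_page() and dump_drop_page() above index a flat bitmap by page frame number: the frame's upper bits select a 32-bit word and the low five bits the bit within it. The same math as a tiny non-atomic test (the kernel uses atomic_set_int/atomic_clear_int because several CPUs may touch the map):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12

static uint32_t bitmap[8];              /* covers 8 * 32 page frames */

/* Set or clear the dump bit for physical address pa. */
static void
page_mark(uint64_t pa, int on)
{
        uint64_t pfn = pa >> PAGE_SHIFT;
        int idx = (int)(pfn >> 5);      /* 2^5 = 32 bits per word */
        int bit = (int)(pfn & 31);

        if (on)
                bitmap[idx] |= 1u << bit;
        else
                bitmap[idx] &= ~(1u << bit);
}

int
main(void)
{
        page_mark(0x00045000, 1);       /* pfn 0x45: word 2, bit 5 */
        assert(bitmap[2] == 1u << 5);
        page_mark(0x00045000, 0);
        assert(bitmap[2] == 0);
        return (0);
}
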
* #define CHECK_POINTS */ #if defined(CHECK_POINTS) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ extern struct pcpu __pcpu[]; /* * Local data and functions. */ static void install_ap_tramp(void); static int start_all_aps(void); static int start_ap(int apic_id); +static char *ap_copyout_buf; +static char *ap_tramp_stack_base; /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; - vm_offset_t addr; - int gsel_tss; - int x, myid; - u_int cr0; + struct i386tss *common_tssp; + struct region_descriptor r_gdt, r_idt; + int gsel_tss, myid, x; + u_int cr0; /* bootAP is set in start_ap() to our ID. 
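The comment above describes a simple shared-variable handshake rather than anything architectural. A minimal sketch of its two halves, assembled from start_all_aps() further below and the assignment that follows here; the lack of locking is safe only because the boot protocol starts one AP at a time:

/* BSP side, in start_all_aps(), before the startup IPI: */
bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 4;
bootAP = cpu;			/* logical CPU id for the next AP */

/* AP side, on entry to init_secondary(): */
myid = bootAP;			/* safe: only one AP is booting at a time */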
*/ myid = bootAP; /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; + pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid]; fix_cpuid(); - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + gdt_segs[GPRIV_SEL].ssd_base = (int)pc; + gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp; + gdt_segs[GLDT_SEL].ssd_base = (int)ldt; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ + r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; + r_idt.rd_base = (int)idt; lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); + PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base + TRAMP_STACK_SZ - + VM86_STACK_SPACE); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; - PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + common_tssp->tss_esp0 = PCPU_GET(trampstk); + common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tssp->tss_ioopt = sizeof(struct i386tss) << 16; PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); + PCPU_SET(copyout_buf, ap_copyout_buf); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. 
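A short recap of the selector arithmetic used above, assuming the usual i386 encoding from machine/segments.h where GSEL(s, r) is ((s) << 3) | (r):

/*
 * Bits 3..15 of a selector hold the descriptor index, bit 2 (clear
 * here) selects the GDT, and bits 0..1 are the requested privilege
 * level.  With SEL_KPL == 0, the TSS selector is just the index
 * scaled by 8.
 */
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);	/* == GPROC0_SEL * 8 */
ltr(gsel_tss);				/* load this CPU's busy TSS */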
*/ while (atomic_load_acq_int(&aps_ready) == 0) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); - for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) - invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif init_secondary_tail(); } /* * start each AP in our list */ -/* Lowest 1MB is already mapped: don't touch*/ #define TMPMAP_START 1 static int start_all_aps(void) { u_char mpbiosreason; u_int32_t mpbioswarmvec; - int apic_id, cpu, i; + int apic_id, cpu; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + /* Remap lowest 1MB */ + IdlePTD[0] = IdlePTD[1]; + load_cr3(rcr3()); /* invalidate TLB */ + /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); - /* set up temporary P==V mapping for AP boot */ - /* XXX this is a hack, we should boot the AP on its own stack/PTD */ - for (i = TMPMAP_START; i < NKPT; i++) - PTD[i] = PTD[KPTDI + i]; - invltlb(); + /* take advantage of the P==V mapping for PTD[0] for AP boot */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_malloc(kernel_arena, kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 4; bootAP = cpu; + ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); + ap_copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); + /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } + /* Unmap lowest 1MB again */ + IdlePTD[0] = 0; + load_cr3(rcr3()); + /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); - /* Undo V==P hack from above */ - for (i = TMPMAP_START; i < NKPT; i++) - PTD[i] = 0; - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); - /* number of APs actually started */ return mp_naps; } /* * load the 1st level AP boot code into base memory. 
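The transient low-memory window in start_all_aps() above works because pmap_cold() (later in this commit) double-maps physical [0, NBPDR) at both VA 0 and VA NBPDR and then clears slot 0 to catch null pointers. A sketch of the idiom:

IdlePTD[0] = IdlePTD[1];	/* VA 0 aliases PA 0 again, one PDE worth */
load_cr3(rcr3());		/* reload %cr3 to drop stale translations */
/* ... APs run the real-mode trampoline below 1MB ... */
IdlePTD[0] = 0;			/* re-arm null-pointer trapping */
load_cr3(rcr3());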
*/ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); - vm_offset_t va = boot_address + KERNBASE; + vm_offset_t va = boot_address; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT (size <= PAGE_SIZE, ("'size' do not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page (kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. */ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); - *dst32 = ((u_int) MPentry - KERNBASE); + *dst32 = (u_int)MPentry; /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } Index: head/sys/i386/i386/mpboot.s =================================================================== --- head/sys/i386/i386/mpboot.s (revision 332488) +++ head/sys/i386/i386/mpboot.s (revision 332489) @@ -1,279 +1,273 @@ /*- * Copyright (c) 1995 Jack F. Vogel * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * mpboot.s: FreeBSD machine support for the Intel MP Spec * multiprocessor systems. * * $FreeBSD$ */ #include "opt_pmap.h" #include /* miscellaneous asm macros */ #include #include #include "assym.inc" -#define R(x) ((x)-KERNBASE) - /* * this code MUST be enabled here and in mp_machdep.c * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define CHECKPOINT(A,D) \ movb $(A),%al ; \ outb %al,$CMOS_REG ; \ movb $(D),%al ; \ outb %al,$CMOS_DATA #else #define CHECKPOINT(A,D) #endif /* CHECK_POINTS */ /* * the APs enter here from their trampoline code (bootMP, below) */ .p2align 4 NON_GPROF_ENTRY(MPentry) CHECKPOINT(0x36, 3) /* * Enable features on this processor. We don't support SMP on * CPUs older than a Pentium, so we know that we can use the cpuid * instruction. */ movl $1,%eax cpuid /* Retrieve features */ movl %cr4,%eax -#ifndef DISABLE_PSE testl $CPUID_PSE,%edx jz 1f orl $CR4_PSE,%eax /* Enable PSE */ 1: -#endif -#ifndef DISABLE_PG_G testl $CPUID_PGE,%edx jz 1f orl $CR4_PGE,%eax /* Enable PGE */ 1: -#endif testl $CPUID_VME,%edx jz 1f orl $CR4_VME,%eax /* Enable VME */ 1: movl %eax,%cr4 /* Now enable paging mode */ #if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax + movl IdlePDPT, %eax movl %eax, %cr3 movl %cr4, %eax orl $CR4_PAE, %eax movl %eax, %cr4 #else - movl R(IdlePTD), %eax + movl IdlePTD, %eax movl %eax,%cr3 #endif movl %cr0,%eax orl $CR0_PE|CR0_PG,%eax /* enable paging */ movl %eax,%cr0 /* let the games begin! */ movl bootSTK,%esp /* boot stack end loc. */ pushl $mp_begin /* jump to high mem */ ret /* * Wait for the booting CPU to signal startup */ mp_begin: /* now running relocated at KERNBASE */ CHECKPOINT(0x37, 4) call init_secondary /* load i386 tables */ /* * This is the embedded trampoline or bootstrap that is * copied into 'real-mode' low memory, it is where the * secondary processor "wakes up". When it is executed * the processor will eventually jump into the routine * MPentry, which resides in normal kernel text above * 1Meg. 
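How the trampoline location and the startup IPI fit together, using a hypothetical boot_address purely for illustration: a SIPI with vector v starts the AP in real mode at CS:IP = (v << 8):0, i.e. at physical v << 12, which is why the trampoline must sit page-aligned below 1MB.

u_int boot_address = 0x4000;			/* assumed for the example */
int vector = (boot_address >> 12) & 0xff;	/* == 4, as in start_ap() */
/* SIPI with vector 4 sends the AP to physical 0x4000, the copied bootMP */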
-jackv */ .data ALIGN_DATA /* just to be sure */ BOOTMP1: NON_GPROF_ENTRY(bootMP) .code16 cli CHECKPOINT(0x34, 1) /* First guarantee a 'clean slate' */ xorl %eax, %eax movl %eax, %ebx movl %eax, %ecx movl %eax, %edx movl %eax, %esi movl %eax, %edi /* set up data segments */ mov %cs, %ax mov %ax, %ds mov %ax, %es mov %ax, %fs mov %ax, %gs mov %ax, %ss mov $(boot_stk-bootMP), %esp /* Now load the global descriptor table */ lgdt MP_GDTptr-bootMP /* Enable protected mode */ movl %cr0, %eax orl $CR0_PE, %eax movl %eax, %cr0 /* * make intrasegment jump to flush the processor pipeline and * reload CS register */ pushl $0x18 pushl $(protmode-bootMP) lretl .code32 protmode: CHECKPOINT(0x35, 2) /* * we are NOW running for the first time with %eip * having the full physical address, BUT we still * are using a segment descriptor with the origin * not matching the booting kernel. * * SO NOW... for the BIG Jump into kernel's segment * and physical text above 1 Meg. */ mov $0x10, %ebx movw %bx, %ds movw %bx, %es movw %bx, %fs movw %bx, %gs movw %bx, %ss .globl bigJump bigJump: /* this will be modified by mpInstallTramp() */ ljmp $0x08, $0 /* far jmp to MPentry() */ dead: hlt /* We should never get here */ jmp dead /* * MP boot strap Global Descriptor Table */ .p2align 4 .globl MP_GDT .globl bootCodeSeg .globl bootDataSeg MP_GDT: nulldesc: /* offset = 0x0 */ .word 0x0 .word 0x0 .byte 0x0 .byte 0x0 .byte 0x0 .byte 0x0 kernelcode: /* offset = 0x08 */ .word 0xffff /* segment limit 0..15 */ .word 0x0000 /* segment base 0..15 */ .byte 0x0 /* segment base 16..23; set for 0K */ .byte 0x9f /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /* segment base 24..32 */ kerneldata: /* offset = 0x10 */ .word 0xffff /* segment limit 0..15 */ .word 0x0000 /* segment base 0..15 */ .byte 0x0 /* segment base 16..23; set for 0k */ .byte 0x93 /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /* segment base 24..32 */ bootcode: /* offset = 0x18 */ .word 0xffff /* segment limit 0..15 */ bootCodeSeg: /* this will be modified by mpInstallTramp() */ .word 0x0000 /* segment base 0..15 */ .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ .byte 0x9e /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /*segment base 24..32 */ bootdata: /* offset = 0x20 */ .word 0xffff bootDataSeg: /* this will be modified by mpInstallTramp() */ .word 0x0000 /* segment base 0..15 */ .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ .byte 0x92 .byte 0xcf .byte 0x0 /* * GDT pointer for the lgdt call */ .globl mp_gdtbase MP_GDTptr: mp_gdtlimit: .word 0x0028 mp_gdtbase: /* this will be modified by mpInstallTramp() */ .long 0 .space 0x100 /* space for boot_stk - 1st temporary stack */ boot_stk: BOOTMP2: .globl bootMP_size bootMP_size: .long BOOTMP2 - BOOTMP1 Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 332488) +++ head/sys/i386/i386/pmap.c (revision 332489) @@ -1,5675 +1,5702 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif +#include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? 
atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; -LIST_HEAD(pmaplist, pmap); -static struct pmaplist allpmaps; -static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; -vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; -extern u_int32_t KERNend; -extern u_int32_t KPTphys; +vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; /* * Crashdump maps. 
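A worked example of the pmap_pde() index arithmetic defined above, assuming non-PAE i386 where PDRSHIFT == 22 and each PDE covers 4MB; the address is a made-up value:

vm_offset_t va = 0x00c12345;
int pdi = va >> PDRSHIFT;	/* 0x00c12345 >> 22 == 3 */
/* pmap_pde(kernel_pmap, va) == &kernel_pmap->pm_pdir[3] */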
*/ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; int pti; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static 
int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait); #endif -static void pmap_set_pg(void); +static void pmap_init_trm(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); +void pmap_cold(void); +extern char _end[]; +u_long physfree; /* phys addr of next free page */ +u_long vm86phystk; /* PA of vm86/bios stack */ +u_long vm86paddr; /* address of vm86 region */ +int vm86pa; /* phys addr of vm86 region */ +u_long KERNend; /* phys addr end of kernel (just after bss) */ +pd_entry_t *IdlePTD; /* phys addr of kernel PTD */ +#if defined(PAE) || defined(PAE_TABLES) +pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ +#endif +pt_entry_t *KPTmap; /* address of kernel page tables */ +u_long KPTphys; /* phys addr of kernel page tables */ + +static u_long +allocpages(u_int cnt, u_long *physfree) +{ + u_long res; + + res = *physfree; + *physfree += PAGE_SIZE * cnt; + bzero((void *)res, PAGE_SIZE * cnt); + return (res); +} + +static void +pmap_cold_map(u_long pa, u_long va, u_long cnt) +{ + pt_entry_t *pt; + + for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; + cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) + *pt = pa | PG_V | PG_RW | PG_A | PG_M; +} + +static void +pmap_cold_mapident(u_long pa, u_long cnt) +{ + + pmap_cold_map(pa, pa, cnt); +} + +_Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); + /* - * If you get an error here, then you set KVA_PAGES wrong! See the - * description of KVA_PAGES in sys/i386/include/pmap.h. It must be - * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. + * Called from locore.s before paging is enabled. Sets up the first + * kernel page table. Since kernel is mapped with PA == VA, this code + * does not require relocations. */ -CTASSERT(KERNBASE % (1 << 24) == 0); +void +pmap_cold(void) +{ + pt_entry_t *pt; + u_long a; + u_int cr3, ncr4; + physfree = (u_long)&_end; + if (bootinfo.bi_esymtab != 0) + physfree = bootinfo.bi_esymtab; + if (bootinfo.bi_kernend != 0) + physfree = bootinfo.bi_kernend; + physfree = roundup2(physfree, NBPDR); + KERNend = physfree; + + /* Allocate Kernel Page Tables */ + KPTphys = allocpages(NKPT, &physfree); + KPTmap = (pt_entry_t *)KPTphys; + + /* Allocate Page Table Directory */ +#if defined(PAE) || defined(PAE_TABLES) + /* XXX only need 32 bytes (easier for now) */ + IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); +#endif + IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); + + /* + * Allocate KSTACK. Leave a guard page between IdlePTD and + * proc0kstack, to control stack overflow for thread0 and + * prevent corruption of the page table. We leak the guard + * physical memory due to 1:1 mappings. + */ + allocpages(1, &physfree); + proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); + + /* vm86/bios stack */ + vm86phystk = allocpages(1, &physfree); + + /* pgtable + ext + IOPAGES */ + vm86paddr = vm86pa = allocpages(3, &physfree); + + /* Install page tables into PTD. Page table page 1 is wasted. */ + for (a = 0; a < NKPT; a++) + IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; + +#if defined(PAE) || defined(PAE_TABLES) + /* PAE install PTD pointers into PDPT */ + for (a = 0; a < NPGPTD; a++) + IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; +#endif + + /* + * Install recursive mapping for kernel page tables into + * itself. 
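What the recursive slots installed by the loop that follows buy us, in a sketch that assumes the conventional i386 PTmap window at PTDPTDI << PDRSHIFT: every page-table page appears at a fixed KVA, so a vtopte()-style lookup is one shift and add. The helper name is hypothetical.

static __inline pt_entry_t *
sketch_vtopte(vm_offset_t va)
{
	pt_entry_t *ptmap = (pt_entry_t *)(PTDPTDI << PDRSHIFT);

	return (ptmap + (va >> PAGE_SHIFT));	/* PTE that maps 'va' */
}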
+ */ + for (a = 0; a < NPGPTD; a++) + IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | + PG_RW; + + /* + * Initialize page table pages mapping physical address zero + * through the (physical) end of the kernel. Many of these + * pages must be reserved, and we reserve them all and map + * them linearly for convenience. We do this even if we've + * enabled PSE above; we'll just switch the corresponding + * kernel PDEs before we turn on paging. + * + * This and all other page table entries allow read and write + * access for various reasons. Kernel mappings never have any + * access restrictions. + */ + pmap_cold_mapident(0, atop(NBPDR)); + pmap_cold_map(0, NBPDR, atop(NBPDR)); + pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); + + /* Map page table directory */ +#if defined(PAE) || defined(PAE_TABLES) + pmap_cold_mapident((u_long)IdlePDPT, 1); +#endif + pmap_cold_mapident((u_long)IdlePTD, NPGPTD); + + /* Map early KPTmap. It is really pmap_cold_mapident. */ + pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); + + /* Map proc0kstack */ + pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); + /* ISA hole already mapped */ + + pmap_cold_mapident(vm86phystk, 1); + pmap_cold_mapident(vm86pa, 3); + + /* Map page 0 into the vm86 page table */ + *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; + + /* ...likewise for the ISA hole for vm86 */ + for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; + a < atop(ISA_HOLE_LENGTH); a++, pt++) + *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | + PG_M | PG_V; + + /* Enable PSE, PGE, VME, and PAE if configured. */ + ncr4 = 0; + if ((cpu_feature & CPUID_PSE) != 0) { + ncr4 |= CR4_PSE; + /* + * Superpage mapping of the kernel text. Existing 4k + * page table pages are wasted. + */ + for (a = KERNBASE; a < KERNend; a += NBPDR) + IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | + PG_RW | PG_V; + } + if ((cpu_feature & CPUID_PGE) != 0) { + ncr4 |= CR4_PGE; + pgeflag = PG_G; + } + ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; +#if defined(PAE) || defined(PAE_TABLES) + ncr4 |= CR4_PAE; +#endif + if (ncr4 != 0) + load_cr4(rcr4() | ncr4); + + /* Now enable paging */ +#if defined(PAE) || defined(PAE_TABLES) + cr3 = (u_int)IdlePDPT; +#else + cr3 = (u_int)IdlePTD; +#endif + load_cr3(cr3); + load_cr0(rcr0() | CR0_PG); + + /* + * Now running relocated at KERNBASE where the system is + * linked to run. + */ + + /* + * Remove the lowest part of the double mapping of low memory + * to get some null pointer checks. + */ + IdlePTD[0] = 0; + load_cr3(cr3); /* invalidate TLB */ +} + /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled + * in locore.s with the page table created in pmap_cold(), * and just syncs the pmap module with what has already been done. - * [We can't call it easily with mapping off since the kernel is not - * mapped with PA == VA, hence we would have to relocate every address - * from the linked base (virtual) address "KERNBASE" to the actual - * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. 
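A worked instance of the PSE remap shown above, assuming non-PAE where NBPDR == 4MB: the _Static_assert earlier pins KERNBASE at 2 * NBPDR == 8MB, so the first kernel-text superpage occupies PDE slot 8MB >> 22 == 2 and maps VA 8MB to PA 8MB as one 4MB page.

IdlePTD[KERNBASE >> PDRSHIFT] =		/* slot 2 */
    KERNBASE | PG_PS | PG_A | PG_M | PG_RW | PG_V;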
*/ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ - virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + virtual_avail = (vm_offset_t)firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); + kernel_pmap->pm_pdir = IdlePTD; #if defined(PAE) || defined(PAE_TABLES) - kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); + kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); - LIST_INIT(&allpmaps); - /* - * Request a spin mutex so that changes to allpmaps cannot be - * preempted by smp_rendezvous_cpus(). Otherwise, - * pmap_update_pde_kernel() could access allpmaps while it is - * being changed. - */ - mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - - /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) - KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; + KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* - * Adjust the start of the KPTD and KPTmap so that the implementation - * of pmap_kextract() and pmap_growkernel() can be made simpler. - */ - KPTD -= KPTDI; - KPTmap -= i386_btop(KPTDI << PDRSHIFT); - - /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* - * Finish removing the identity mapping (virt == phys) of low memory. - * It was only used for 2 instructions in locore. locore then - * unmapped the first PTD to get some null pointer checks. 
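One SYSMAP() expansion from above, spelled out for the CMAP3/CADDR3 pair: it reserves n pages of bootstrap KVA and records the first backing PTE so the caller can install mappings later.

CADDR3 = (caddr_t)va;		/* v = (c)va           */
va += 1 * PAGE_SIZE;		/* va += n * PAGE_SIZE */
CMAP3 = pte;			/* p = pte             */
pte += 1;			/* pte += n            */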
ACPI - * wakeup will map the first PTD transiently to use it for 1 - * instruction. The double mapping for low memory is not usable in - * normal operation since it breaks trapping of null pointers and - * causes inconsistencies in page tables when combined with PG_G. - */ - for (i = 1; i < NKPT; i++) - PTD[i] = 0; - - /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); - - /* Turn on PG_G on kernel page(s) */ - pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); + mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | + MTX_NEW); + pc->pc_copyout_maddr = kva_alloc(ptoa(2)); + if (pc->pc_copyout_maddr == 0) + panic("unable to allocate non-sleepable copyout KVA"); + sx_init(&pc->pc_copyout_slock, "cpslk"); + pc->pc_copyout_saddr = kva_alloc(ptoa(2)); + if (pc->pc_copyout_saddr == 0) + panic("unable to allocate sleepable copyout KVA"); + /* - * Skip if the mapping has already been initialized, + * Skip if the mappings have already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; + mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) - panic("%s: unable to allocate KVA", __func__); + panic("unable to allocate CMAP KVA"); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); - pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); + pc->pc_qmap_addr = pages + atop(2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. 
*/ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* - * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. - */ -static void -pmap_set_pg(void) -{ - pt_entry_t *pte; - vm_offset_t va, endva; - - if (pgeflag == 0) - return; - - endva = KERNBASE + KERNend; - - if (pseflag) { - va = KERNBASE + roundup2(KERNLOAD, NBPDR); - while (va < endva) { - pdir_pde(PTD, va) |= pgeflag; - invltlb(); /* Flush non-PG_G entries. */ - va += NBPDR; - } - } else { - va = (vm_offset_t)btext; - while (va < endva) { - pte = vtopte(va); - if (*pte) - *pte |= pgeflag; - invltlb(); /* Flush non-PG_G entries. */ - va += PAGE_SIZE; - } - } -} - -/* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. 
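The shape of the PTE-threaded freelist implemented above, with hypothetical page-aligned KVA addresses va1 and va2: each free page's PTE slot stores the VA of the next free page, and because PG_V is never set the hardware never interprets the links as mappings.

vm_offset_t head = 0;
pmap_ptelist_free(&head, va1);	/* *vtopte(va1) = 0;   head = va1 */
pmap_ptelist_free(&head, va2);	/* *vtopte(va2) = va1; head = va2 */
/* pmap_ptelist_alloc(&head) now returns va2 and leaves head == va1 */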
*/ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { - mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); + mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; - mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); + mpte->phys_addr = KPTphys + ptoa(i); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. 
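A worked example of the pv_npg sizing above, with a made-up 512MB physical memory top: one md_page is needed per possible 2/4MB superpage frame.

pv_npg = trunc_4mpage(0x20000000 - PAGE_SIZE) / NBPDR + 1;
/* trunc_4mpage(0x1ffff000) == 0x1fc00000; 0x1fc00000 / NBPDR == 127;
 * pv_npg == 128 */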
*/ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; + pmap_init_trm(); + if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } + } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; - pmap_t pmap; - boolean_t PTD_updated; - PTD_updated = FALSE; - mtx_lock_spin(&allpmaps_lock); - LIST_FOREACH(pmap, &allpmaps, pm_list) { - if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & - PG_FRAME)) - PTD_updated = TRUE; - pde = pmap_pde(pmap, va); - pde_store(pde, newpde); - } - mtx_unlock_spin(&allpmaps_lock); - KASSERT(PTD_updated, - ("pmap_kenter_pde: current page table is not in allpmaps")); + pde = pmap_pde(kernel_pmap, va); + pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. 
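A worked example of pmap_cache_bits() above under the pat_works table: PAT_WRITE_COMBINING maps to pat_idx 6, binary 110, so a 4KB PTE picks up PG_PTE_PAT and PG_NC_PCD while PG_NC_PWT stays clear.

int wc_bits = pmap_cache_bits(PAT_WRITE_COMBINING, FALSE);
/* wc_bits == (PG_PTE_PAT | PG_NC_PCD) */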
* * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { - u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); - else if ((newpde & PG_G) == 0) + else /* if ((newpde & PG_G) == 0) */ /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); - else { - /* - * Promotion: flush every 4KB page mapping from the TLB, - * including any global (PG_G) mappings. - */ - cr4 = rcr4(); - load_cr4(cr4 & ~CR4_PGE); - /* - * Although preemption at this point could be detrimental to - * performance, it would not lead to an error. PG_G is simply - * ignored if CR4.PGE is clear. Moreover, in case this block - * is re-entered, the load_cr4() either above or below will - * modify CR4.PGE flushing the TLB. - */ - load_cr4(cr4 | CR4_PGE); - } } void invltlb_glob(void) { - uint64_t cr4; - if (pgeflag == 0) { - invltlb(); - } else { - cr4 = rcr4(); - load_cr4(cr4 & ~CR4_PGE); - load_cr4(cr4 | CR4_PGE); - } + invltlb(); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. 
*/ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { + if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; + } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { + mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { + if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; + } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { + mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { - invltlb_glob(); + invltlb(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { - invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; - pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { - - /* - * Elsewhere, this operation requires allpmaps_lock for - * synchronization. Here, it does not because it is being - * performed in the context of an all_cpus rendezvous. - */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - pde = pmap_pde(pmap, act->va); - pde_store(pde, act->newpde); - } + pde = pmap_pde(kernel_pmap, act->va); + pde_store(pde, act->newpde); } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. 
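The three-phase rendezvous that pmap_update_pde() below performs for a kernel PDE, reduced to its essentials: exactly one CPU stores the new entry while every CPU that may cache stale translations flushes in the teardown phase.

act.store = cpuid;		/* the one designated writer */
act.invalidate = active;	/* everyone who must flush */
smp_rendezvous_cpus(active, smp_no_rendezvous_barrier,
    pmap_update_pde_kernel, pmap_update_pde_teardown, &act);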
*/ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + if (pmap == kernel_pmap) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + if (pmap == kernel_pmap) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) - invltlb_glob(); - else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. 
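 *
 * (Only sva is checked below: the local APIC register window is a
 * single page, so it is expected to be flushed on its own rather
 * than as part of a larger range.)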
*/ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { - return (pmap == kernel_pmap || pmap == - vmspace_pmap(curthread->td_proc->p_vmspace)); + return (pmap == kernel_pmap); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? 
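 * If so, vtopte() resolves the PTE directly through the recursive
 * mapping.  Otherwise fall through to the PMAP1/PADDR1 window:
 * PMAP1 is a reserved kernel PTE and PADDR1 the page it maps, so
 * pointing *PMAP1 at the page table page exposes that page at
 * PADDR1, and the entry of interest is then
 * PADDR1 + (i386_btop(va) & (NPTEPG - 1)).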
*/ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | pgeflag); + pte_store(pte, pa | PG_RW | PG_V); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); + pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
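 *
 * For example (a sketch only; the addresses are made up), mapping
 * 8MB of physical memory during boot might look like:
 *
 *	vm_offset_t va = VM_MIN_KERNEL_ADDRESS;	(hypothetical hint)
 *	vm_offset_t sva;
 *
 *	sva = pmap_map(&va, 0x40000000, 0x40800000,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *
 * On return the range is mapped at [sva, va), using superpages where
 * the alignment logic below permits.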
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); - newpde = start | PG_PS | pgeflag | PG_RW | PG_V; + newpde = start | PG_PS | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; #if defined(PAE) || defined(PAE_TABLES) - pte_store(pte, pa | pgeflag | pg_nx | PG_RW | PG_V); + pte_store(pte, pa | pg_nx | PG_RW | PG_V); #else - pte_store(pte, pa | pgeflag | PG_RW | PG_V); + pte_store(pte, pa | PG_RW | PG_V); #endif } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. 
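 *
 * The collection is a radix tree keyed by the page's pindex, which is
 * va >> PDRSHIFT, i.e. one key per 4MB (2MB under PAE) directory
 * slot; pmap_remove_pt_page() below looks entries up by the same key.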
*/ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; - if (va >= VM_MAXUSER_ADDRESS) + if (pmap == kernel_pmap) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); - /* - * Since the page table directory is shared with the kernel pmap, - * which is already included in the list "allpmaps", this pmap does - * not need to be inserted into that list. - */ - pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); + pmap->pm_pdir = IdlePTD; #if defined(PAE) || defined(PAE_TABLES) - pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); + pmap->pm_pdpt = IdlePDPT; #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { - vm_page_t m, ptdpg[NPGPTD]; - vm_paddr_t pa; + vm_page_t m; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
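 *
 * (Sizing note: the directory occupies NPGPTD pages, NBPTD bytes of
 * KVA in all; with PAE that is four pages plus a separate 32-byte
 * PDPT from pdptzone, and without PAE a single page.)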
*/ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (m == NULL) + if (m == NULL) { vm_wait(NULL); - else - ptdpg[i++] = m; + } else { + pmap->pm_ptdpg[i] = m; +#if defined(PAE) || defined(PAE_TABLES) + pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; +#endif + i++; + } } - pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); + pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) - if ((ptdpg[i]->flags & PG_ZERO) == 0) + if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); - /* Copy the kernel page table directory entries. */ - bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); - mtx_unlock_spin(&allpmaps_lock); + /* Install the trampoline mapping. */ + pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; - /* install self-referential address mapping entry(s) */ - for (i = 0; i < NPGPTD; i++) { - pa = VM_PAGE_TO_PHYS(ptdpg[i]); - pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; -#if defined(PAE) || defined(PAE_TABLES) - pmap->pm_pdpt[i] = pa | PG_V; -#endif - } - CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. 
*/ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { - vm_page_t m, ptdpg[NPGPTD]; + vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); - mtx_lock_spin(&allpmaps_lock); - LIST_REMOVE(pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - - for (i = 0; i < NPGPTD; i++) - ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & - PG_FRAME); - - bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * - sizeof(*pmap->pm_pdir)); - pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { - m = ptdpg[i]; + m = pmap->pm_ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif vm_page_unwire_noq(m); - vm_page_free_zero(m); + vm_page_free(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); - pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; + pdir_pde(KPTD, kernel_vm_end) = newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. 
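 *
 * pv entries are allocated in page-sized chunks: each pv_chunk holds
 * _NPCPV (336) entries tracked by an _NPCM-word (11 * 32 = 352 bit)
 * free bitmap.  Since 336 = 10 * 32 + 16, words 0 through 9 are fully
 * usable (PC_FREE0_9 = 0xffffffff) while word 10 uses only its low
 * 16 bits (PC_FREE10 = 0x0000ffff); the CTASSERTs below pin these
 * invariants.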
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. 
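 * The loop below visits allocated entries by inverting the free
 * bitmap: inuse = ~pc->pc_map[field] & pc_freemask[field] has one bit
 * set per live pv entry, bsfl() (find-first-set) returns the lowest
 * such bit, and field * 32 + bit indexes pc_pventry[].  Freed entries
 * are returned by setting their bit back in pc_map[].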
*/ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. 
*/ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. 
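 * The remaining NPTEPG - 1 entries (1023, or 511 with PAE) are then
 * allocated one at a time, which is what makes demoting a managed
 * superpage comparatively expensive.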
*/ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. 
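 * (Demotion replaces the superpage PDE with a reference to a page
 * table page whose NPTEPG 4KB entries, generated by pmap_fill_ptp()
 * above, reproduce the original mapping exactly.)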
If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } - if (va < VM_MAXUSER_ADDRESS) + if (pmap != kernel_pmap) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ - if (va >= KERNBASE) + if (pmap == kernel_pmap) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. 
*/ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. 
*/ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. 
*/ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. 
*/ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. 
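 * This relies on the processor performing a locked page-table
 * walk to set PG_M on the first write through any cached
 * writable translation; that walk re-reads the PTE, observes
 * that PG_RW is now clear, and faults instead, so no
 * modification can go unrecorded.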
*/ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); - KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, - ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", + KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || + (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), + ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); + KASSERT(va < PMAP_TRM_MIN_ADDRESS, + ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); - if (va < VM_MAXUSER_ADDRESS) { + if (pmap != kernel_pmap) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. 
*/ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, + KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || + va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; - if (va < VM_MAXUSER_ADDRESS) + if (pmap != kernel_pmap) newpte |= PG_U; - if (pmap == kernel_pmap) - newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. 
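* PG_M and PG_A are masked out of the comparison because the
* hardware maintains them; a mapping that differs only in those
* bits does not need to be rewritten.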
*/ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); #endif sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. 
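* (A worked example of my own, assuming 4KB pages: if m_start has
* pindex 10 and start is 0x1000000, a resident page with pindex 13
* is mapped at 0x1003000, and absent pindexes in between are
* skipped.)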
*/ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || - (m->oflags & VPO_UNMANAGED) != 0, + KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || + va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ - if (va < VM_MAXUSER_ADDRESS) { + if (pmap != kernel_pmap) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } - /* - * This call to vtopte makes the assumption that we are - * entering the page into the current pmap. In order to support - * quick entry into any pmap, one would likely use pmap_pte_quick. - * But that isn't as quick as vtopte. - */ - pte = vtopte(va); + /* XXXKIB: pmap_pte_quick() instead ? */ + pte = pmap_pte(pmap, va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } + pmap_pte_release(pte); return (mpte); } /* * Enter on the PV list if part of our managed memory. 
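* If pmap_try_insert_pv_entry() fails for lack of a pv entry, the
* page table page reference taken above is dropped again and NULL
* is returned, as the code below shows.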
*/ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } + pmap_pte_release(pte); return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); + pmap_pte_release(pte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. 
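* (Typically reached via vm_map_unwire(), for example when
* munlock(2) drops the last wiring of a range.)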
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * - * This routine is only advisory and need not do anything. + * This routine is only advisory and need not do anything. Since + * current pmap is always the kernel pmap when executing in + * kernel, and we do not copy from the kernel pmap to a user + * pmap, this optimization is not usable in 4/4G full split i386 + * world. 
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { - struct spglist free; - vm_offset_t addr; - vm_offset_t end_addr = src_addr + len; - vm_offset_t pdnxt; +} - if (dst_addr != src_addr) - return; - - if (!pmap_is_current(src_pmap)) - return; - - rw_wlock(&pvh_global_lock); - if (dst_pmap < src_pmap) { - PMAP_LOCK(dst_pmap); - PMAP_LOCK(src_pmap); - } else { - PMAP_LOCK(src_pmap); - PMAP_LOCK(dst_pmap); - } - sched_pin(); - for (addr = src_addr; addr < end_addr; addr = pdnxt) { - pt_entry_t *src_pte, *dst_pte; - vm_page_t dstmpte, srcmpte; - pd_entry_t srcptepaddr; - u_int ptepindex; - - KASSERT(addr < UPT_MIN_ADDRESS, - ("pmap_copy: invalid to pmap_copy page tables")); - - pdnxt = (addr + NBPDR) & ~PDRMASK; - if (pdnxt < addr) - pdnxt = end_addr; - ptepindex = addr >> PDRSHIFT; - - srcptepaddr = src_pmap->pm_pdir[ptepindex]; - if (srcptepaddr == 0) - continue; - - if (srcptepaddr & PG_PS) { - if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) - continue; - if (dst_pmap->pm_pdir[ptepindex] == 0 && - ((srcptepaddr & PG_MANAGED) == 0 || - pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & - PG_PS_FRAME))) { - dst_pmap->pm_pdir[ptepindex] = srcptepaddr & - ~PG_W; - dst_pmap->pm_stats.resident_count += - NBPDR / PAGE_SIZE; - pmap_pde_mappings++; - } - continue; - } - - srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - KASSERT(srcmpte->wire_count > 0, - ("pmap_copy: source page table page is unused")); - - if (pdnxt > end_addr) - pdnxt = end_addr; - - src_pte = vtopte(addr); - while (addr < pdnxt) { - pt_entry_t ptetemp; - ptetemp = *src_pte; - /* - * we only virtual copy managed pages - */ - if ((ptetemp & PG_MANAGED) != 0) { - dstmpte = pmap_allocpte(dst_pmap, addr, - PMAP_ENTER_NOSLEEP); - if (dstmpte == NULL) - goto out; - dst_pte = pmap_pte_quick(dst_pmap, addr); - if (*dst_pte == 0 && - pmap_try_insert_pv_entry(dst_pmap, addr, - PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { - /* - * Clear the wired, modified, and - * accessed (referenced) bits - * during the copy. - */ - *dst_pte = ptetemp & ~(PG_W | PG_M | - PG_A); - dst_pmap->pm_stats.resident_count++; - } else { - SLIST_INIT(&free); - if (pmap_unwire_ptp(dst_pmap, dstmpte, - &free)) { - pmap_invalidate_page(dst_pmap, - addr); - vm_page_free_pages_toq(&free, - true); - } - goto out; - } - if (dstmpte->wire_count >= srcmpte->wire_count) - break; - } - addr += PAGE_SIZE; - src_pte++; - } - } -out: - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(src_pmap); - PMAP_UNLOCK(dst_pmap); -} - /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. 
*/ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy one specified hardware page to another. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
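* (The cap of 16 bounds the time spent under the PV lock; callers
* treat the result as a page-aging hint, not an exact count.)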
*/ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from the specified address space; * this aids process exit speeds. Also, this code * is special-cased for the current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
*/ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { - pte = vtopte(pv->pv_va); + pte = pmap_pte_quick(pmap, pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 4mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { - pte = vtopte(addr); - rv = *pte == 0; + pte = pmap_pte(pmap, addr); + if (pte != NULL) + rv = *pte == 0; + pmap_pte_release(pte); } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. 
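*/

/*
 * A standalone, user-space sketch of the selection test below (my own
 * illustration, not part of this change): among the NPTEPG 4KB pages
 * backing one 2/4MB mapping, exactly one (page, superpage, pmap)
 * combination hashes to zero and has its reference bit cleared.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PDRSHIFT	22
#define NPTEPG		1024

static int
selected(uint32_t pa, uint32_t va, uintptr_t pmap)
{
	return ((((pa >> PAGE_SHIFT) ^ (va >> PDRSHIFT) ^ pmap) &
	    (NPTEPG - 1)) == 0);
}

int
main(void)
{
	uint32_t pa;
	int hits = 0;

	/* Scan the 1024 4KB frames that back one 4MB superpage. */
	for (pa = 0x40000000; pa < 0x40000000 + (NPTEPG << PAGE_SHIFT);
	    pa += 1 << PAGE_SHIFT)
		hits += selected(pa, 0x00800000, 0x1234);
	printf("%d\n", hits);	/* prints 1 */
	return (0);
}

/*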
*/ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. 
*/ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. 
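*/

/*
 * The same lock-free update pattern in portable C11 atomics (an
 * illustrative sketch of mine, not kernel code): retry the compare-
 * and-swap until no other CPU changed the word in the meantime, and
 * skip the store entirely when the bits already match.
 */
#include <stdatomic.h>

static void
update_low_bits(_Atomic unsigned int *word, unsigned int mask,
    unsigned int bits)
{
	unsigned int oval, nval;

	oval = atomic_load(word);
	do {
		nval = (oval & ~mask) | bits;
	} while (nval != oval &&
	    !atomic_compare_exchange_weak(word, &oval, nval));
}

/*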
*/ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; - if (pa < KERNLOAD && pa + size <= KERNLOAD) - va = KERNBASE + pa; + if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) + va = pa + PMAP_MAP_LOW; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; - if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) + if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). 
*/ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. 
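* (The first pass above already validated the range and demoted any
* 2/4MB pages that needed it, so this second pass only rewrites
* attribute bits and cannot fail.)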
*/ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; - load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
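*/

/*
 * A user-space sketch of the adjustment below (the figures are my own,
 * assuming 4MB superpages, i.e. PDRSHIFT == 22): the start address is
 * moved up to the next address whose offset within a 4MB frame equals
 * the object offset's, so that object pages and mapping can share
 * superpages.
 */
#include <stdint.h>
#include <stdio.h>

#define PDRSHIFT	22
#define PDRMASK		((1u << PDRSHIFT) - 1)

static uint32_t
align_superpage(uint32_t addr, uint32_t offset)
{
	uint32_t superpage_offset = offset & PDRMASK;

	if ((addr & PDRMASK) == superpage_offset)
		return (addr);
	if ((addr & PDRMASK) < superpage_offset)
		return ((addr & ~PDRMASK) + superpage_offset);
	return (((addr + PDRMASK) & ~PDRMASK) + superpage_offset);
}

int
main(void)
{
	/* 0x00525000 with object offset 0x123000 moves to 0x00923000. */
	printf("%#x\n", align_superpage(0x00525000, 0x00123000));
	return (0);
}

/*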
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); +} + +static vmem_t *pmap_trm_arena; +static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; +static int trm_guard = PAGE_SIZE; + +static int +pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, + vmem_addr_t *addrp) +{ + vm_page_t m; + vmem_addr_t af, addr, prev_addr; + pt_entry_t *trm_pte; + + prev_addr = atomic_load_long(&pmap_trm_arena_last); + size = round_page(size) + trm_guard; + for (;;) { + if (prev_addr + size < prev_addr || prev_addr + size < size || + prev_addr + size > PMAP_TRM_MAX_ADDRESS) + return (ENOMEM); + addr = prev_addr + size; + if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) + break; + } + prev_addr += trm_guard; + trm_pte = PTmap + atop(prev_addr); + for (af = prev_addr; af < addr; af += PAGE_SIZE) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); + pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | + PG_M | PG_A | PG_RW | PG_V | pgeflag | + pmap_cache_bits(VM_MEMATTR_DEFAULT, FALSE)); + } + *addrp = prev_addr; + return (0); +} + +static +void pmap_init_trm(void) +{ + vm_page_t pd_m; + + TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); + if ((trm_guard & PAGE_MASK) != 0) + trm_guard = 0; + pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); + vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); + pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); + if ((pd_m->flags & PG_ZERO) == 0) + pmap_zero_page(pd_m); + PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | + pmap_cache_bits(VM_MEMATTR_DEFAULT, TRUE); +} + +void * +pmap_trm_alloc(size_t size, int flags) +{ + vmem_addr_t res; + int error; + + MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); + error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), + 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); + if (error != 0) + return (NULL); + return ((void *)res); +} + +void +pmap_trm_free(void *addr, size_t size) +{ + + vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) 
continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: head/sys/i386/i386/sigtramp.s =================================================================== --- head/sys/i386/i386/sigtramp.s (revision 332488) +++ head/sys/i386/i386/sigtramp.s (revision 332489) @@ -1,116 +1,138 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 * $FreeBSD$ * * originally from: locore.s, by William F. Jolitz * * Substantially rewritten by David Greenman, Rod Grimes, * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp * and many others. */ #include #include #include #include "assym.inc" /* * Signal trampoline, copied to top of user stack */ NON_GPROF_ENTRY(sigcode) calll *SIGF_HANDLER(%esp) leal SIGF_UC(%esp),%eax /* get ucontext */ pushl %eax testl $PSL_VM,UC_EFLAGS(%eax) jne 1f mov UC_GS(%eax),%gs /* restore %gs */ 1: movl $SYS_sigreturn,%eax pushl %eax /* junk to fake return addr. 
*/ int $0x80 /* enter kernel with args */ /* on stack */ 1: jmp 1b #ifdef COMPAT_FREEBSD4 ALIGN_TEXT freebsd4_sigcode: calll *SIGF_HANDLER(%esp) leal SIGF_UC4(%esp),%eax /* get ucontext */ pushl %eax testl $PSL_VM,UC4_EFLAGS(%eax) jne 1f mov UC4_GS(%eax),%gs /* restore %gs */ 1: movl $344,%eax /* 4.x SYS_sigreturn */ pushl %eax /* junk to fake return addr. */ int $0x80 /* enter kernel with args */ /* on stack */ 1: jmp 1b #endif #ifdef COMPAT_43 ALIGN_TEXT osigcode: call *SIGF_HANDLER(%esp) /* call signal handler */ lea SIGF_SC(%esp),%eax /* get sigcontext */ pushl %eax testl $PSL_VM,SC_PS(%eax) jne 9f mov SC_GS(%eax),%gs /* restore %gs */ 9: movl $103,%eax /* 3.x SYS_sigreturn */ pushl %eax /* junk to fake return addr. */ int $0x80 /* enter kernel with args */ 0: jmp 0b + +/* + * Our lcall $7,$0 handler remains in user mode (ring 3), since lcalls + * don't change the interrupt mask, so if this one went directly to the + * kernel then there would be a window with interrupts enabled in kernel + * mode, and all interrupt handlers would have to be almost as complicated + * as the NMI handler to support this. + * + * Instead, convert the lcall to an int0x80 call. The kernel does most + * of the conversion by popping the lcall return values off the user + * stack and returning to them instead of to here, except when the + * conversion itself fails. Adjusting the stack here is impossible for + * vfork() and harder for other syscalls. + */ + ALIGN_TEXT +lcall_tramp: + int $0x80 +1: jmp 1b + #endif /* COMPAT_43 */ ALIGN_TEXT esigcode: .data .globl szsigcode szsigcode: .long esigcode-sigcode #ifdef COMPAT_FREEBSD4 .globl szfreebsd4_sigcode szfreebsd4_sigcode: .long esigcode-freebsd4_sigcode #endif #ifdef COMPAT_43 .globl szosigcode szosigcode: .long esigcode-osigcode + .globl sz_lcall_tramp +sz_lcall_tramp: + .long esigcode-lcall_tramp #endif Index: head/sys/i386/i386/support.s =================================================================== --- head/sys/i386/i386/support.s (revision 332488) +++ head/sys/i386/i386/support.s (revision 332489) @@ -1,837 +1,486 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include "assym.inc" #define IDXSHIFT 10 .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret END(bzero) ENTRY(sse2_pagezero) pushl %ebx movl 8(%esp),%ecx movl %ecx,%eax addl $4096,%eax xor %ebx,%ebx jmp 1f /* * The loop takes 14 bytes. Ensure that it doesn't cross a 16-byte * cache line. */ .p2align 4,0x90 1: movnti %ebx,(%ecx) movnti %ebx,4(%ecx) addl $8,%ecx cmpl %ecx,%eax jne 1b sfence popl %ebx ret END(sse2_pagezero) ENTRY(i686_pagezero) pushl %edi pushl %ebx movl 12(%esp),%edi movl $1024,%ecx ALIGN_TEXT 1: xorl %eax,%eax repe scasl jnz 2f popl %ebx popl %edi ret ALIGN_TEXT 2: incl %ecx subl $4,%edi movl %ecx,%edx cmpl $16,%ecx jge 3f movl %edi,%ebx andl $0x3f,%ebx shrl %ebx shrl %ebx movl $16,%ecx subl %ebx,%ecx 3: subl %ecx,%edx rep stosl movl %edx,%ecx testl %edx,%edx jnz 1b popl %ebx popl %edi ret END(i686_pagezero) /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx rep stosw popl %edi ret END(fillw) ENTRY(bcopyb) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. */ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret END(bcopyb) /* * bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(bcopy) pushl %ebp movl %esp,%ebp pushl %esi pushl %edi movl 8(%ebp),%esi movl 12(%ebp),%edi movl 16(%ebp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 16(%ebp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi popl %ebp ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 16(%ebp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld popl %ebp ret END(bcopy) /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret END(memcpy) -/*****************************************************************************/ -/* copyout and fubyte family */ -/*****************************************************************************/ /* - * Access user memory from inside the kernel. These routines and possibly - * the math- and DOS emulators should be the only places that do this. 
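The unsigned comparison that bcopyb and bcopy use above ("overlapping && src < dst?") does double duty: computed modulo 2^32, (dst - src) is below len exactly when the destination starts inside the source range, which is the only case where a forward copy would clobber bytes that have not been read yet. A byte-at-a-time C rendering of that dispatch, with bcopy_sketch as a hypothetical name rather than the committed routine:

    #include <stddef.h>
    #include <stdint.h>

    static void
    bcopy_sketch(const char *src, char *dst, size_t len)
    {
        size_t i;

        /* One unsigned compare covers both overlap and ordering. */
        if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len) {
            for (i = len; i > 0; i--)       /* overlap: copy backwards */
                dst[i - 1] = src[i - 1];
        } else {
            for (i = 0; i < len; i++)       /* forward copy is safe */
                dst[i] = src[i];
        }
    }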
- * - * We have to access the memory with user's permissions, so use a segment - * selector with RPL 3. For writes to user space we have to additionally - * check the PTE for write permission, because the 386 does not check - * write permissions when we are executing with EPL 0. The 486 does check - * this if the WP bit is set in CR0, so we can use a simpler version here. - * - * These routines set curpcb->pcb_onfault for the time they execute. When a - * protection violation occurs inside the functions, the trap handler - * returns to *curpcb->pcb_onfault instead of the function. - */ - -/* - * copyout(from_kernel, to_user, len) - MP SAFE - */ -ENTRY(copyout) - movl PCPU(CURPCB),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - testl %ebx,%ebx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. This check is essential - * because it prevents usermode from writing into the kernel. We do - * not verify anywhere else that the user did not specify a rogue - * address. - */ - /* - * First, prevent address wrapping. - */ - movl %edi,%eax - addl %ebx,%eax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. - */ - cmpl $VM_MAXUSER_ADDRESS,%eax - ja copyout_fault - - /* bcopy(%esi, %edi, %ebx) */ - movl %ebx,%ecx - - shrl $2,%ecx - rep - movsl - movb %bl,%cl - andb $3,%cl - rep - movsb - -done_copyout: - popl %ebx - popl %edi - popl %esi - xorl %eax,%eax - movl PCPU(CURPCB),%edx - movl %eax,PCB_ONFAULT(%edx) - ret -END(copyout) - - ALIGN_TEXT -copyout_fault: - popl %ebx - popl %edi - popl %esi - movl PCPU(CURPCB),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax - ret - -/* - * copyin(from_user, to_kernel, len) - MP SAFE - */ -ENTRY(copyin) - movl PCPU(CURPCB),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ - - /* - * make sure address is valid - */ - movl %esi,%edx - addl %ecx,%edx - jc copyin_fault - cmpl $VM_MAXUSER_ADDRESS,%edx - ja copyin_fault - - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb - - popl %edi - popl %esi - xorl %eax,%eax - movl PCPU(CURPCB),%edx - movl %eax,PCB_ONFAULT(%edx) - ret -END(copyin) - - ALIGN_TEXT -copyin_fault: - popl %edi - popl %esi - movl PCPU(CURPCB),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax - ret - -/* - * casueword. Compare and set user word. Returns -1 on fault, - * 0 on non-faulting access. The current value is in *oldp. - */ -ALTENTRY(casueword32) -ENTRY(casueword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx /* dst */ - movl 8(%esp),%eax /* old */ - movl 16(%esp),%ecx /* new */ - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ - ja fusufault - -#ifdef SMP - lock -#endif - cmpxchgl %ecx,(%edx) /* Compare and set. */ - - /* - * The old value is in %eax. If the store succeeded it will be the - * value we expected (old) from before the store, otherwise it will - * be the current value. 
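Restated in C, the casueword contract described above looks roughly as follows; the pcb_onfault setup and the VM_MAXUSER_ADDRESS check are elided, __sync_val_compare_and_swap stands in for the locked cmpxchgl, and casueword_sketch is an illustrative name:

    #include <sys/types.h>

    static int
    casueword_sketch(volatile u_long *uaddr, u_long old, u_long *oldp,
        u_long new)
    {
        /* A faulting access would instead return -1 via pcb_onfault. */
        *oldp = __sync_val_compare_and_swap(uaddr, old, new);
        return (0);     /* caller infers success from *oldp == old */
    }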
- */ - - movl PCPU(CURPCB),%ecx - movl $0,PCB_ONFAULT(%ecx) - movl 12(%esp),%edx /* oldp */ - movl %eax,(%edx) - xorl %eax,%eax - ret -END(casueword32) -END(casueword) - -/* - * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user - * memory. - */ - -ALTENTRY(fueword32) -ENTRY(fueword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx /* from */ - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ - ja fusufault - - movl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - movl 8(%esp),%edx - movl %eax,(%edx) - xorl %eax,%eax - ret -END(fueword32) -END(fueword) - -/* * fuswintr() and suswintr() are specialized variants of fuword16() and * suword16(), respectively. They are called from the profiling code, * potentially at interrupt time. If they fail, that's okay; good things * will happen later. They always fail for now, until the trap code is * able to deal with this. */ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret END(suswintr) END(fuswintr) - -ENTRY(fuword16) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-2,%edx - ja fusufault - - movzwl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - ret -END(fuword16) - -ENTRY(fubyte) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-1,%edx - ja fusufault - - movzbl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - ret -END(fubyte) - - ALIGN_TEXT -fusufault: - movl PCPU(CURPCB),%ecx - xorl %eax,%eax - movl %eax,PCB_ONFAULT(%ecx) - decl %eax - ret - -/* - * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory. - * All these functions are MPSAFE. - */ - -ALTENTRY(suword32) -ENTRY(suword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ - ja fusufault - - movl 8(%esp),%eax - movl %eax,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx - movl %eax,PCB_ONFAULT(%ecx) - ret -END(suword32) -END(suword) - -ENTRY(suword16) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ - ja fusufault - - movw 8(%esp),%ax - movw %ax,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx /* restore trashed register */ - movl %eax,PCB_ONFAULT(%ecx) - ret -END(suword16) - -ENTRY(subyte) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ - ja fusufault - - movb 8(%esp),%al - movb %al,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx /* restore trashed register */ - movl %eax,PCB_ONFAULT(%ecx) - ret -END(subyte) - -/* - * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE - * - * copy a string from 'from' to 'to', stop when a 0 character is reached. - * return ENAMETOOLONG if string is longer than maxlen, and - * EFAULT on protection violations. If lencopied is non-zero, - * return the actual length in *lencopied. 
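The copyinstr semantics spelled out in the comment above reduce to this C sketch; copyinstr_sketch is a hypothetical name, and the real routine additionally bounds 'from' against VM_MAXUSER_ADDRESS and recovers from faults through pcb_onfault:

    #include <errno.h>
    #include <stddef.h>

    static int
    copyinstr_sketch(const char *from, char *to, size_t maxlen,
        size_t *lencopied)
    {
        size_t i;

        for (i = 0; i < maxlen; i++) {
            if ((to[i] = from[i]) == '\0') {
                if (lencopied != NULL)
                    *lencopied = i + 1;     /* count the NUL */
                return (0);
            }
        }
        if (lencopied != NULL)
            *lencopied = maxlen;
        return (ENAMETOOLONG);
    }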
- */ -ENTRY(copyinstr) - pushl %esi - pushl %edi - movl PCPU(CURPCB),%ecx - movl $cpystrflt,PCB_ONFAULT(%ecx) - - movl 12(%esp),%esi /* %esi = from */ - movl 16(%esp),%edi /* %edi = to */ - movl 20(%esp),%edx /* %edx = maxlen */ - - movl $VM_MAXUSER_ADDRESS,%eax - - /* make sure 'from' is within bounds */ - subl %esi,%eax - jbe cpystrflt - - /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ - cmpl %edx,%eax - jae 1f - movl %eax,%edx - movl %eax,20(%esp) -1: - incl %edx - -2: - decl %edx - jz 3f - - lodsb - stosb - orb %al,%al - jnz 2b - - /* Success -- 0 byte reached */ - decl %edx - xorl %eax,%eax - jmp cpystrflt_x -3: - /* edx is zero - return ENAMETOOLONG or EFAULT */ - cmpl $VM_MAXUSER_ADDRESS,%esi - jae cpystrflt -4: - movl $ENAMETOOLONG,%eax - jmp cpystrflt_x - -cpystrflt: - movl $EFAULT,%eax - -cpystrflt_x: - /* set *lencopied and return %eax */ - movl PCPU(CURPCB),%ecx - movl $0,PCB_ONFAULT(%ecx) - movl 20(%esp),%ecx - subl %edx,%ecx - movl 24(%esp),%edx - testl %edx,%edx - jz 1f - movl %ecx,(%edx) -1: - popl %edi - popl %esi - ret -END(copyinstr) /* * copystr(from, to, maxlen, int *lencopied) - MP SAFE */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret END(copystr) ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx movl %edx,%ecx shrl $2,%ecx repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb 1: setne %al movsbl %al,%eax popl %esi popl %edi ret END(bcmp) /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%gs movl %eax,%ss movl $KPSEL,%eax movl %eax,%fs /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) MEXITCOUNT lret END(lgdt) /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret END(ssdtosd) /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax movl %eax,%dr7 /* disable all breakpoints first */ movl %eax,%dr0 movl %eax,%dr1 movl %eax,%dr2 movl %eax,%dr3 movl %eax,%dr6 ret END(reset_dbregs) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp 
*/ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx rdmsr movl 8(%esp),%ecx movl %eax,(%ecx) movl %edx,4(%ecx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * Support for writing MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx movl 8(%esp),%eax movl 12(%esp),%edx wrmsr xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl $EFAULT,%eax ret ENTRY(handle_ibrs_entry) ret END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) ret END(handle_ibrs_exit) Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 332488) +++ head/sys/i386/i386/swtch.s (revision 332489) @@ -1,471 +1,466 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
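rdmsr_safe() and wrmsr_safe() above reuse the same pcb_onfault protocol as the removed user-access routines: the #GP raised by an unimplemented MSR lands in msr_onfault, which clears the handler and returns EFAULT instead of panicking. A hypothetical in-kernel caller; the MSR number 0x17 is purely illustrative:

    static void
    probe_msr(void)
    {
        uint64_t val;

        if (rdmsr_safe(0x17, &val) == 0)
            printf("MSR 0x17 = 0x%jx\n", (uintmax_t)val);
        else
            printf("MSR 0x17 faulted; not implemented on this CPU\n");
    }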
* * $FreeBSD$ */ #include "opt_sched.h" #include #include "assym.inc" #if defined(SMP) && defined(SCHED_ULE) #define SETOP xchgl #define BLOCK_SPIN(reg) \ movl $blocked_lock,%eax ; \ 100: ; \ lock ; \ cmpxchgl %eax,TD_LOCK(reg) ; \ jne 101f ; \ pause ; \ jmp 100b ; \ 101: #else #define SETOP movl #define BLOCK_SPIN(reg) #endif /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .text /* * cpu_throw() * * This is the second half of cpu_switch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care * about its state. This is only a slight optimization and is probably * not worth it anymore. Note that we need to clear the pm_active bits so * we do need the old proc if it still exists. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd */ ENTRY(cpu_throw) movl PCPU(CPUID), %esi movl 4(%esp),%ecx /* Old thread */ testl %ecx,%ecx /* no thread? */ jz 1f /* release bit from old pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ 1: movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx - movl PCB_CR3(%edx),%eax - movl %eax,%cr3 /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 END(cpu_throw) /* * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd * 12(%esp) = newlock */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movl 4(%esp),%ecx #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw2 /* no, panic */ #endif movl TD_PCB(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) mov %gs,PCB_GS(%edx) /* Test if debug registers should be saved. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000fc00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) jne 1f pushl PCB_SAVEFPU(%edx) /* h/w bugs make saving complicated */ call npxsave /* do it in a big C function */ popl %eax 1: - /* Save is done. Now fire up new thread. Leave old vmspace. */ + /* Save is done. Now fire up new thread. */ movl 4(%esp),%edi movl 8(%esp),%ecx /* New thread */ movl 12(%esp),%esi /* New lock */ #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif movl TD_PCB(%ecx),%edx - /* switch address space */ - movl PCB_CR3(%edx),%eax - movl %cr3,%ebx /* The same address space? 
*/ - cmpl %ebx,%eax - je sw0 - movl %eax,%cr3 /* new address space */ + /* Switchout td_lock */ movl %esi,%eax movl PCPU(CPUID),%esi - SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ + SETOP %eax,TD_LOCK(%edi) /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ /* Set bit in new pmap->pm_active */ movl TD_PROC(%ecx),%eax /* newproc */ movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 sw0: SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: BLOCK_SPIN(%ecx) /* - * At this point, we've switched address spaces and are ready + * At this point, we have managed thread locks and are ready * to load up the rest of the next context. */ + + /* Load a pointer to the thread kernel stack into PCPU. */ + leal -VM86_STACK_SPACE(%edx), %eax /* leave space for vm86 */ + movl %eax, PCPU(KESP0) + cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ movl $1, PCPU(PRIVATE_TSS) /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ + movl PCPU(TRAMPSTK), %ebx + movl %ebx, PCB_EXT_TSS+TSS_ESP0(%edi) jmp 2f /* Load it up */ 1: /* * Use the common default TSS instead of our own. - * Set our stack pointer into the TSS, it's set to just - * below the PCB. In C, common_tss.tss_esp0 = &pcb - 16; - */ - leal -16(%edx), %ebx /* leave space for vm86 */ - movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 - - /* - * Test this CPU's bit in the bitmap to see if this - * CPU was using a private TSS. + * Stack pointer in the common TSS points to the trampoline stack + * already and should not be changed. + * + * Test this CPU's flag to see if this CPU was using a private TSS. */ cmpl $0, PCPU(PRIVATE_TSS) /* Already using the common? */ je 3f /* if so, skip reloading */ movl $0, PCPU(PRIVATE_TSS) PCPU_ADDR(COMMON_TSSD, %edi) 2: /* Move correct tss descriptor into GDT slot, then reload tr. */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl 4(%edi), %esi movl %eax, 0(%ebx) movl %esi, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(GPROC0_SEL, SEL_KPL) */ ltr %si 3: /* Copy the %fs and %gs selectors into this pcpu gdt */ leal PCB_FSD(%edx), %esi movl PCPU(FSGS_GDT), %edi movl 0(%esi), %eax /* %fs selector */ movl 4(%esi), %ebx movl %eax, 0(%edi) movl %ebx, 4(%edi) movl 8(%esi), %eax /* %gs selector, comes straight after */ movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) /* Restore context. */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ /* * Determine the LDT to use and load it if it is the default one and * that is not the current one. */ movl TD_PROC(%ecx),%eax cmpl $0,P_MD+MD_LDT(%eax) jnz 1f movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt _default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: /* Load the LDT when it is not the default one. */ pushl %edx /* Preserve pointer to pcb. */ addl $P_MD,%eax /* Pointer to mdproc is arg. */ pushl %eax /* * Holding dt_lock prevents context switches, so dt_lock cannot * be held now and set_user_ldt() will not deadlock acquiring it. */ call set_user_ldt addl $4,%esp popl %edx 2: /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: mov PCB_GS(%edx),%gs /* Test if debug registers should be restored.
*/ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* * Restore debug registers. The special code for dr7 is to * preserve the current values of its reserved bits. */ movl PCB_DR6(%edx),%eax movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl %dr7,%eax andl $0x0000fc00,%eax movl PCB_DR7(%edx),%ecx andl $~0x0000fc00,%ecx orl %ecx,%eax movl %eax,%dr7 1: ret #ifdef INVARIANTS badsw1: pushal pushl $sw0_1 call panic sw0_1: .asciz "cpu_throw: no newthread supplied" badsw2: pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: no curthread supplied" badsw3: pushal pushl $sw0_3 call panic sw0_3: .asciz "cpu_switch: no newthread supplied" #endif END(cpu_switch) /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* Fetch PCB. */ movl 4(%esp),%ecx /* Save caller's return address. Child won't execute this routine. */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) mov %gs,PCB_GS(%ecx) movl %cr0,%eax movl %eax,PCB_CR0(%ecx) movl %cr2,%eax movl %eax,PCB_CR2(%ecx) movl %cr4,%eax movl %eax,PCB_CR4(%ecx) movl %dr0,%eax movl %eax,PCB_DR0(%ecx) movl %dr1,%eax movl %eax,PCB_DR1(%ecx) movl %dr2,%eax movl %eax,PCB_DR2(%ecx) movl %dr3,%eax movl %eax,PCB_DR3(%ecx) movl %dr6,%eax movl %eax,PCB_DR6(%ecx) movl %dr7,%eax movl %eax,PCB_DR7(%ecx) mov %ds,PCB_DS(%ecx) mov %es,PCB_ES(%ecx) mov %fs,PCB_FS(%ecx) mov %ss,PCB_SS(%ecx) sgdt PCB_GDT(%ecx) sidt PCB_IDT(%ecx) sldt PCB_LDT(%ecx) str PCB_TR(%ecx) movl $1,%eax ret END(savectx) /* * resumectx(pcb) __fastcall * Resuming processor state from pcb. */ ENTRY(resumectx) /* Restore GDT. */ lgdt PCB_GDT(%ecx) /* Restore segment registers */ movzwl PCB_DS(%ecx),%eax mov %ax,%ds movzwl PCB_ES(%ecx),%eax mov %ax,%es movzwl PCB_FS(%ecx),%eax mov %ax,%fs movzwl PCB_GS(%ecx),%eax movw %ax,%gs movzwl PCB_SS(%ecx),%eax mov %ax,%ss /* Restore CR2, CR4, CR3 and CR0 */ movl PCB_CR2(%ecx),%eax movl %eax,%cr2 movl PCB_CR4(%ecx),%eax movl %eax,%cr4 movl PCB_CR3(%ecx),%eax movl %eax,%cr3 movl PCB_CR0(%ecx),%eax movl %eax,%cr0 jmp 1f 1: /* Restore descriptor tables */ lidt PCB_IDT(%ecx) lldt PCB_LDT(%ecx) #define SDT_SYS386TSS 9 #define SDT_SYS386BSY 11 /* Clear "task busy" bit and reload TR */ movl PCPU(TSS_GDT),%eax andb $(~SDT_SYS386BSY | SDT_SYS386TSS),5(%eax) movzwl PCB_TR(%ecx),%eax ltr %ax #undef SDT_SYS386TSS #undef SDT_SYS386BSY /* Restore debug registers */ movl PCB_DR0(%ecx),%eax movl %eax,%dr0 movl PCB_DR1(%ecx),%eax movl %eax,%dr1 movl PCB_DR2(%ecx),%eax movl %eax,%dr2 movl PCB_DR3(%ecx),%eax movl %eax,%dr3 movl PCB_DR6(%ecx),%eax movl %eax,%dr6 movl PCB_DR7(%ecx),%eax movl %eax,%dr7 /* Restore other registers */ movl PCB_EDI(%ecx),%edi movl PCB_ESI(%ecx),%esi movl PCB_EBP(%ecx),%ebp movl PCB_ESP(%ecx),%esp movl PCB_EBX(%ecx),%ebx /* reload code selector by turning return into intersegmental return */ pushl PCB_EIP(%ecx) movl $KCSEL,4(%esp) xorl %eax,%eax lret END(resumectx) Index: head/sys/i386/i386/sys_machdep.c =================================================================== --- head/sys/i386/i386/sys_machdep.c (revision 332488) +++ head/sys/i386/i386/sys_machdep.c (revision 332489) @@ -1,811 +1,809 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for kernel_map */ #define MAX_LD 8192 #define LD_PER_PAGE 512 #define NEW_MAX_LD(num) rounddown2(num + LD_PER_PAGE, LD_PER_PAGE) #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3) #define NULL_LDT_BASE ((caddr_t)NULL) #ifdef SMP static void set_user_ldt_rv(void *arg); #endif static int i386_set_ldt_data(struct thread *, int start, int num, union descriptor *descs); static int i386_ldt_grow(struct thread *td, int len); void fill_based_sd(struct segment_descriptor *sdp, uint32_t base) { sdp->sd_lobase = base & 0xffffff; sdp->sd_hibase = (base >> 24) & 0xff; sdp->sd_lolimit = 0xffff; /* 4GB limit, wraps around */ sdp->sd_hilimit = 0xf; sdp->sd_type = SDT_MEMRWA; sdp->sd_dpl = SEL_UPL; sdp->sd_p = 1; sdp->sd_xx = 0; sdp->sd_def32 = 1; sdp->sd_gran = 1; } /* * Construct special descriptors for "base" selectors. Store them in * the PCB for later use by cpu_switch(). Store them in the GDT for * more immediate use. The GDT entries are part of the current * context. Callers must load related segment registers to complete * setting up the current context. 
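fill_based_sd() above scatters the 32-bit base across sd_lobase (low 24 bits) and sd_hibase (high 8 bits); the I386_GET_FSBASE and I386_GET_GSBASE cases below reassemble it as sd_hibase << 24 | sd_lobase. A self-checking sketch of that round trip, under an illustrative function name:

    #include <assert.h>
    #include <stdint.h>

    static void
    base_split_roundtrip(uint32_t base)
    {
        uint32_t lobase = base & 0xffffff;      /* -> sd_lobase */
        uint32_t hibase = (base >> 24) & 0xff;  /* -> sd_hibase */

        /* Reassembly is lossless for any 32-bit base. */
        assert((hibase << 24 | lobase) == base);
    }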
*/ void set_fsbase(struct thread *td, uint32_t base) { struct segment_descriptor sd; fill_based_sd(&sd, base); critical_enter(); td->td_pcb->pcb_fsd = sd; PCPU_GET(fsgs_gdt)[0] = sd; critical_exit(); } void set_gsbase(struct thread *td, uint32_t base) { struct segment_descriptor sd; fill_based_sd(&sd, base); critical_enter(); td->td_pcb->pcb_gsd = sd; PCPU_GET(fsgs_gdt)[1] = sd; critical_exit(); } #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; char *parms; }; #endif int sysarch(struct thread *td, struct sysarch_args *uap) { int error; union descriptor *lp; union { struct i386_ldt_args largs; struct i386_ioperm_args iargs; struct i386_get_xfpustate xfpu; } kargs; uint32_t base; struct segment_descriptor *sdp; AUDIT_ARG_CMD(uap->op); #ifdef CAPABILITY_MODE /* * When adding new operations, add a new case statement here to * explicitly indicate whether or not the operation is safe to * perform in capability mode. */ if (IN_CAPABILITY_MODE(td)) { switch (uap->op) { case I386_GET_LDT: case I386_SET_LDT: case I386_GET_IOPERM: case I386_GET_FSBASE: case I386_SET_FSBASE: case I386_GET_GSBASE: case I386_SET_GSBASE: case I386_GET_XFPUSTATE: break; case I386_SET_IOPERM: default: #ifdef KTRACE if (KTRPOINT(td, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_SYSCALL, NULL, NULL); #endif return (ECAPMODE); } } #endif switch (uap->op) { case I386_GET_IOPERM: case I386_SET_IOPERM: if ((error = copyin(uap->parms, &kargs.iargs, sizeof(struct i386_ioperm_args))) != 0) return (error); break; case I386_GET_LDT: case I386_SET_LDT: if ((error = copyin(uap->parms, &kargs.largs, sizeof(struct i386_ldt_args))) != 0) return (error); break; case I386_GET_XFPUSTATE: if ((error = copyin(uap->parms, &kargs.xfpu, sizeof(struct i386_get_xfpustate))) != 0) return (error); break; default: break; } switch (uap->op) { case I386_GET_LDT: error = i386_get_ldt(td, &kargs.largs); break; case I386_SET_LDT: if (kargs.largs.descs != NULL) { if (kargs.largs.num > MAX_LD) return (EINVAL); lp = malloc(kargs.largs.num * sizeof(union descriptor), M_TEMP, M_WAITOK); error = copyin(kargs.largs.descs, lp, kargs.largs.num * sizeof(union descriptor)); if (error == 0) error = i386_set_ldt(td, &kargs.largs, lp); free(lp, M_TEMP); } else { error = i386_set_ldt(td, &kargs.largs, NULL); } break; case I386_GET_IOPERM: error = i386_get_ioperm(td, &kargs.iargs); if (error == 0) error = copyout(&kargs.iargs, uap->parms, sizeof(struct i386_ioperm_args)); break; case I386_SET_IOPERM: error = i386_set_ioperm(td, &kargs.iargs); break; case I386_VM86: error = vm86_sysarch(td, uap->parms); break; case I386_GET_FSBASE: sdp = &td->td_pcb->pcb_fsd; base = sdp->sd_hibase << 24 | sdp->sd_lobase; error = copyout(&base, uap->parms, sizeof(base)); break; case I386_SET_FSBASE: error = copyin(uap->parms, &base, sizeof(base)); if (error == 0) { /* * Construct the special descriptor for fsbase * and arrange for doreti to load its selector * soon enough. */ set_fsbase(td, base); td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL); } break; case I386_GET_GSBASE: sdp = &td->td_pcb->pcb_gsd; base = sdp->sd_hibase << 24 | sdp->sd_lobase; error = copyout(&base, uap->parms, sizeof(base)); break; case I386_SET_GSBASE: error = copyin(uap->parms, &base, sizeof(base)); if (error == 0) { /* * Construct the special descriptor for gsbase. * The selector is loaded immediately, since we * normally only reload %gs on context switches. 
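Userland reaches the I386_SET_GSBASE path just described through the libc wrappers for sysarch(2); a minimal sketch, with tls_block standing in for a real per-thread TLS area:

    #include <machine/sysarch.h>

    static char tls_block[128];     /* illustrative TLS area */

    static void
    install_gsbase(void)
    {
        void *cur;

        if (i386_set_gsbase(tls_block) != 0)
            return;                 /* sysarch(2) failed */
        i386_get_gsbase(&cur);      /* cur is now tls_block */
    }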
*/ set_gsbase(td, base); load_gs(GSEL(GUGS_SEL, SEL_UPL)); } break; case I386_GET_XFPUSTATE: if (kargs.xfpu.len > cpu_max_ext_state_size - sizeof(union savefpu)) return (EINVAL); npxgetregs(td); error = copyout((char *)(get_pcb_user_save_td(td) + 1), kargs.xfpu.addr, kargs.xfpu.len); break; default: error = EINVAL; break; } return (error); } int i386_extend_pcb(struct thread *td) { int i, offset; u_long *addr; struct pcb_ext *ext; struct soft_segment_descriptor ssd = { 0, /* segment base address (overwritten) */ ctob(IOPAGES + 1) - 1, /* length */ SDT_SYS386TSS, /* segment type */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 0, /* default 32 size */ 0 /* granularity */ }; - ext = (struct pcb_ext *)kmem_malloc(kernel_arena, ctob(IOPAGES+1), - M_WAITOK | M_ZERO); + ext = pmap_trm_alloc(ctob(IOPAGES + 1), M_WAITOK | M_ZERO); /* -16 is so we can convert a trapframe into vm86trapframe in place */ - ext->ext_tss.tss_esp0 = (vm_offset_t)td->td_pcb - 16; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); /* * The last byte of the i/o map must be followed by an 0xff byte. * We arbitrarily allocate 16 bytes here, to keep the starting * address on a doubleword boundary. */ offset = PAGE_SIZE - 16; ext->ext_tss.tss_ioopt = (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16; ext->ext_iomap = (caddr_t)ext + offset; ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32; addr = (u_long *)ext->ext_vm86.vm86_intmap; for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++) *addr++ = ~0; ssd.ssd_base = (unsigned)&ext->ext_tss; ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext); ssdtosd(&ssd, &ext->ext_tssd); KASSERT(td == curthread, ("giving TSS to !curthread")); KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!")); /* Switch to the new TSS. */ critical_enter(); + ext->ext_tss.tss_esp0 = PCPU_GET(trampstk); td->td_pcb->pcb_ext = ext; PCPU_SET(private_tss, 1); *PCPU_GET(tss_gdt) = ext->ext_tssd; ltr(GSEL(GPROC0_SEL, SEL_KPL)); critical_exit(); return 0; } int i386_set_ioperm(td, uap) struct thread *td; struct i386_ioperm_args *uap; { char *iomap; u_int i; int error; if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); /* * XXX * While this is restricted to root, we should probably figure out * whether any other driver is using this i/o address, so as not to * cause confusion. This probably requires a global 'usage registry'. */ if (td->td_pcb->pcb_ext == 0) if ((error = i386_extend_pcb(td)) != 0) return (error); iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; if (uap->start > uap->start + uap->length || uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); for (i = uap->start; i < uap->start + uap->length; i++) { if (uap->enable) iomap[i >> 3] &= ~(1 << (i & 7)); else iomap[i >> 3] |= (1 << (i & 7)); } return (error); } int i386_get_ioperm(td, uap) struct thread *td; struct i386_ioperm_args *uap; { int i, state; char *iomap; if (uap->start >= IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); if (td->td_pcb->pcb_ext == 0) { uap->length = 0; goto done; } iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; i = uap->start; state = (iomap[i >> 3] >> (i & 7)) & 1; uap->enable = !state; uap->length = 1; for (i = uap->start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) break; uap->length++; } done: return (0); } /* * Update the GDT entry pointing to the LDT to point to the LDT of the * current process. Manage dt_lock holding/unholding autonomously.
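The ioperm pair above is what the i386_set_ioperm(2) and i386_get_ioperm(2) wrappers call into; per the priv_check() and securelevel_gt() tests, the caller must hold PRIV_IO and run at securelevel 0 or below. A sketch in which the parallel-port range 0x378-0x37b is only an example:

    #include <err.h>
    #include <machine/sysarch.h>

    static void
    grab_printer_ports(void)
    {
        /* Request direct access to four consecutive I/O ports. */
        if (i386_set_ioperm(0x378, 4, 1) != 0)
            err(1, "i386_set_ioperm");
    }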
*/ static void set_user_ldt_locked(struct mdproc *mdp) { struct proc_ldt *pldt; int gdt_idx; mtx_assert(&dt_lock, MA_OWNED); pldt = mdp->md_ldt; gdt_idx = GUSERLDT_SEL; gdt_idx += PCPU_GET(cpuid) * NGDT; /* always 0 on UP */ gdt[gdt_idx].sd = pldt->ldt_sd; lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); } void set_user_ldt(struct mdproc *mdp) { mtx_lock_spin(&dt_lock); set_user_ldt_locked(mdp); mtx_unlock_spin(&dt_lock); } #ifdef SMP static void set_user_ldt_rv(void *arg) { struct proc *p; p = curproc; if (arg == p->p_vmspace) set_user_ldt(&p->p_md); } #endif /* * dt_lock must be held. Returns with dt_lock held. */ struct proc_ldt * user_ldt_alloc(struct mdproc *mdp, int len) { struct proc_ldt *pldt, *new_ldt; mtx_assert(&dt_lock, MA_OWNED); mtx_unlock_spin(&dt_lock); new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); new_ldt->ldt_len = len = NEW_MAX_LD(len); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - len * sizeof(union descriptor), M_WAITOK | M_ZERO); + new_ldt->ldt_base = pmap_trm_alloc(len * sizeof(union descriptor), + M_WAITOK | M_ZERO); new_ldt->ldt_refcnt = 1; new_ldt->ldt_active = 0; mtx_lock_spin(&dt_lock); gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base; gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1; ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd); if ((pldt = mdp->md_ldt) != NULL) { if (len > pldt->ldt_len) len = pldt->ldt_len; bcopy(pldt->ldt_base, new_ldt->ldt_base, len * sizeof(union descriptor)); } else - bcopy(ldt, new_ldt->ldt_base, sizeof(ldt)); + bcopy(ldt, new_ldt->ldt_base, sizeof(union descriptor) * NLDT); return (new_ldt); } /* * Must be called with dt_lock held. Returns with dt_lock unheld. */ void user_ldt_free(struct thread *td) { struct mdproc *mdp; struct proc_ldt *pldt; mtx_assert(&dt_lock, MA_OWNED); mdp = &td->td_proc->p_md; if ((pldt = mdp->md_ldt) == NULL) { mtx_unlock_spin(&dt_lock); return; } if (td == curthread) { lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); } mdp->md_ldt = NULL; user_ldt_deref(pldt); } void user_ldt_deref(struct proc_ldt *pldt) { mtx_assert(&dt_lock, MA_OWNED); if (--pldt->ldt_refcnt == 0) { mtx_unlock_spin(&dt_lock); - kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base, - pldt->ldt_len * sizeof(union descriptor)); + pmap_trm_free(pldt->ldt_base, pldt->ldt_len * + sizeof(union descriptor)); free(pldt, M_SUBPROC); } else mtx_unlock_spin(&dt_lock); } /* * Note for the authors of compat layers (linux, etc): copyout() in * the function below is not a problem since it presents data in * arch-specific format (i.e. i386-specific in this case), not in * the OS-specific one. */ int i386_get_ldt(struct thread *td, struct i386_ldt_args *uap) { struct proc_ldt *pldt; char *data; u_int nldt, num; int error; #ifdef DEBUG printf("i386_get_ldt: start=%u num=%u descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif num = min(uap->num, MAX_LD); data = malloc(num * sizeof(union descriptor), M_TEMP, M_WAITOK); mtx_lock_spin(&dt_lock); pldt = td->td_proc->p_md.md_ldt; nldt = pldt != NULL ? pldt->ldt_len : nitems(ldt); if (uap->start >= nldt) { num = 0; } else { num = min(num, nldt - uap->start); bcopy(pldt != NULL ? 
&((union descriptor *)(pldt->ldt_base))[uap->start] : &ldt[uap->start], data, num * sizeof(union descriptor)); } mtx_unlock_spin(&dt_lock); error = copyout(data, uap->descs, num * sizeof(union descriptor)); if (error == 0) td->td_retval[0] = num; free(data, M_TEMP); return (error); } int i386_set_ldt(struct thread *td, struct i386_ldt_args *uap, union descriptor *descs) { struct mdproc *mdp; struct proc_ldt *pldt; union descriptor *dp; u_int largest_ld, i; int error; #ifdef DEBUG printf("i386_set_ldt: start=%u num=%u descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif error = 0; mdp = &td->td_proc->p_md; if (descs == NULL) { /* Free descriptors */ if (uap->start == 0 && uap->num == 0) { /* * Treat this as a special case, so userland needn't * know magic number NLDT. */ uap->start = NLDT; uap->num = MAX_LD - NLDT; } mtx_lock_spin(&dt_lock); if ((pldt = mdp->md_ldt) == NULL || uap->start >= pldt->ldt_len) { mtx_unlock_spin(&dt_lock); return (0); } largest_ld = uap->start + uap->num; if (largest_ld > pldt->ldt_len) largest_ld = pldt->ldt_len; for (i = uap->start; i < largest_ld; i++) atomic_store_rel_64(&((uint64_t *)(pldt->ldt_base))[i], 0); mtx_unlock_spin(&dt_lock); return (0); } if (uap->start != LDT_AUTO_ALLOC || uap->num != 1) { /* verify range of descriptors to modify */ largest_ld = uap->start + uap->num; if (uap->start >= MAX_LD || largest_ld > MAX_LD) return (EINVAL); } /* Check descriptors for access violations */ for (i = 0; i < uap->num; i++) { dp = &descs[i]; switch (dp->sd.sd_type) { case SDT_SYSNULL: /* system null */ dp->sd.sd_p = 0; break; case SDT_SYS286TSS: /* system 286 TSS available */ case SDT_SYSLDT: /* system local descriptor table */ case SDT_SYS286BSY: /* system 286 TSS busy */ case SDT_SYSTASKGT: /* system task gate */ case SDT_SYS286IGT: /* system 286 interrupt gate */ case SDT_SYS286TGT: /* system 286 trap gate */ case SDT_SYSNULL2: /* undefined by Intel */ case SDT_SYS386TSS: /* system 386 TSS available */ case SDT_SYSNULL3: /* undefined by Intel */ case SDT_SYS386BSY: /* system 386 TSS busy */ case SDT_SYSNULL4: /* undefined by Intel */ case SDT_SYS386IGT: /* system 386 interrupt gate */ case SDT_SYS386TGT: /* system 386 trap gate */ case SDT_SYS286CGT: /* system 286 call gate */ case SDT_SYS386CGT: /* system 386 call gate */ return (EACCES); /* memory segment types */ case SDT_MEMEC: /* memory execute only conforming */ case SDT_MEMEAC: /* memory execute only accessed conforming */ case SDT_MEMERC: /* memory execute read conforming */ case SDT_MEMERAC: /* memory execute read accessed conforming */ /* Must be "present" if executable and conforming. */ if (dp->sd.sd_p == 0) return (EACCES); break; case SDT_MEMRO: /* memory read only */ case SDT_MEMROA: /* memory read only accessed */ case SDT_MEMRW: /* memory read write */ case SDT_MEMRWA: /* memory read write accessed */ case SDT_MEMROD: /* memory read only expand dwn limit */ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ case SDT_MEMRWD: /* memory read write expand dwn limit */ case SDT_MEMRWDA: /* memory read write expand dwn lim accessed */ case SDT_MEME: /* memory execute only */ case SDT_MEMEA: /* memory execute only accessed */ case SDT_MEMER: /* memory execute read */ case SDT_MEMERA: /* memory execute read accessed */ break; default: return (EINVAL); } /* Only user (ring-3) descriptors may be present.
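For reference, i386_get_ldt() above is reachable from userland through the libc wrapper of the same name; a sketch that reads back the first entries, where the count of 16 is arbitrary:

    #include <machine/segments.h>
    #include <machine/sysarch.h>

    static int
    dump_ldt_head(void)
    {
        union descriptor descs[16];

        /* Returns the number of descriptors copied out, or -1. */
        return (i386_get_ldt(0, descs, 16));
    }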
*/ if (dp->sd.sd_p != 0 && dp->sd.sd_dpl != SEL_UPL) return (EACCES); } if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) { /* Allocate a free slot */ mtx_lock_spin(&dt_lock); if ((pldt = mdp->md_ldt) == NULL) { if ((error = i386_ldt_grow(td, NLDT + 1))) { mtx_unlock_spin(&dt_lock); return (error); } pldt = mdp->md_ldt; } again: /* * start scanning a bit up to leave room for NVidia and * Wine, which still use the "Blat" method of allocation. */ dp = &((union descriptor *)(pldt->ldt_base))[NLDT]; for (i = NLDT; i < pldt->ldt_len; ++i) { if (dp->sd.sd_type == SDT_SYSNULL) break; dp++; } if (i >= pldt->ldt_len) { if ((error = i386_ldt_grow(td, pldt->ldt_len+1))) { mtx_unlock_spin(&dt_lock); return (error); } goto again; } uap->start = i; error = i386_set_ldt_data(td, i, 1, descs); mtx_unlock_spin(&dt_lock); } else { largest_ld = uap->start + uap->num; mtx_lock_spin(&dt_lock); if (!(error = i386_ldt_grow(td, largest_ld))) { error = i386_set_ldt_data(td, uap->start, uap->num, descs); } mtx_unlock_spin(&dt_lock); } if (error == 0) td->td_retval[0] = uap->start; return (error); } static int i386_set_ldt_data(struct thread *td, int start, int num, union descriptor *descs) { struct mdproc *mdp; struct proc_ldt *pldt; uint64_t *dst, *src; int i; mtx_assert(&dt_lock, MA_OWNED); mdp = &td->td_proc->p_md; pldt = mdp->md_ldt; dst = (uint64_t *)(pldt->ldt_base); src = (uint64_t *)descs; /* * Atomic(9) is used only to get 64bit atomic store with * cmpxchg8b when available. There is no op without release * semantic. */ for (i = 0; i < num; i++) atomic_store_rel_64(&dst[start + i], src[i]); return (0); } static int i386_ldt_grow(struct thread *td, int len) { struct mdproc *mdp; struct proc_ldt *new_ldt, *pldt; caddr_t old_ldt_base; int old_ldt_len; mtx_assert(&dt_lock, MA_OWNED); if (len > MAX_LD) return (ENOMEM); if (len < NLDT + 1) len = NLDT + 1; mdp = &td->td_proc->p_md; old_ldt_base = NULL_LDT_BASE; old_ldt_len = 0; /* Allocate a user ldt. */ if ((pldt = mdp->md_ldt) == NULL || len > pldt->ldt_len) { new_ldt = user_ldt_alloc(mdp, len); if (new_ldt == NULL) return (ENOMEM); pldt = mdp->md_ldt; if (pldt != NULL) { if (new_ldt->ldt_len <= pldt->ldt_len) { /* * We just lost the race for allocation, so * free the new object and return. */ mtx_unlock_spin(&dt_lock); - kmem_free(kernel_arena, - (vm_offset_t)new_ldt->ldt_base, + pmap_trm_free(new_ldt->ldt_base, new_ldt->ldt_len * sizeof(union descriptor)); free(new_ldt, M_SUBPROC); mtx_lock_spin(&dt_lock); return (0); } /* * We have to substitute the current LDT entry for * curproc with the new one since its size grew. */ old_ldt_base = pldt->ldt_base; old_ldt_len = pldt->ldt_len; pldt->ldt_sd = new_ldt->ldt_sd; pldt->ldt_base = new_ldt->ldt_base; pldt->ldt_len = new_ldt->ldt_len; } else mdp->md_ldt = pldt = new_ldt; #ifdef SMP /* * Signal other cpus to reload ldt. We need to unlock dt_lock * here because other CPUs will contend on it since their * curthreads won't hold the lock and will block when trying * to acquire it.
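The LDT_AUTO_ALLOC path above is likewise driven from userland through the i386_set_ldt() wrapper; a sketch requesting a fresh slot with a ring-3 data descriptor whose contents are illustrative only:

    #include <string.h>
    #include <machine/segments.h>
    #include <machine/sysarch.h>

    static int
    alloc_ldt_slot(void)
    {
        union descriptor d;

        memset(&d, 0, sizeof(d));
        d.sd.sd_type = SDT_MEMRWA;      /* read/write data */
        d.sd.sd_dpl = SEL_UPL;          /* ring 3, as the check requires */
        d.sd.sd_p = 1;
        /* Returns the allocated slot number, or -1 on error. */
        return (i386_set_ldt(LDT_AUTO_ALLOC, &d, 1));
    }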
*/ mtx_unlock_spin(&dt_lock); smp_rendezvous(NULL, set_user_ldt_rv, NULL, td->td_proc->p_vmspace); #else set_user_ldt_locked(&td->td_proc->p_md); mtx_unlock_spin(&dt_lock); #endif if (old_ldt_base != NULL_LDT_BASE) { - kmem_free(kernel_arena, (vm_offset_t)old_ldt_base, - old_ldt_len * sizeof(union descriptor)); + pmap_trm_free(old_ldt_base, old_ldt_len * + sizeof(union descriptor)); free(new_ldt, M_SUBPROC); } mtx_lock_spin(&dt_lock); } return (0); } Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 332488) +++ head/sys/i386/i386/trap.c (revision 332489) @@ -1,1106 +1,1127 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" +#include "opt_compat.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif void trap(struct trapframe *frame); void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); -extern inthand_t IDTVEC(lcall_syscall); - #define MAX_TRAP_MSG 32 -static char *trap_msg[] = { - "", /* 0 unused */ - "privileged instruction fault", /* 1 T_PRIVINFLT */ - "", /* 2 unused */ - "breakpoint instruction fault", /* 3 T_BPTFLT */ - "", /* 4 unused */ - "", /* 5 unused */ - "arithmetic trap", /* 6 T_ARITHTRAP */ - "", /* 7 unused */ - "", /* 8 unused */ - "general protection fault", /* 9 T_PROTFLT */ - "trace trap", /* 10 T_TRCTRAP */ - "", /* 11 unused */ - "page fault", /* 12 T_PAGEFLT */ - "", /* 13 unused */ - "alignment fault", /* 14 T_ALIGNFLT */ - "", /* 15 unused */ - "", /* 16 unused */ - "", /* 17 unused */ - "integer divide fault", /* 18 T_DIVIDE */ - "non-maskable interrupt trap", /* 19 T_NMI */ - "overflow trap", /* 20 T_OFLOW */ - "FPU bounds check fault", /* 21 T_BOUND */ - "FPU device not available", /* 22 T_DNA */ - "double fault", /* 23 T_DOUBLEFLT */ - "FPU operand fetch fault", /* 24 T_FPOPFLT */ - "invalid TSS fault", /* 25 T_TSSFLT */ - "segment not present fault", /* 26 T_SEGNPFLT */ - "stack fault", /* 27 T_STKFLT */ - "machine check trap", /* 28 T_MCHK */ - "SIMD floating-point exception", /* 29 T_XMMFLT */ - "reserved (unknown) fault", /* 30 T_RESERVED */ - "", /* 31 unused (reserved) */ - "DTrace pid return trap", /* 32 T_DTRACE_RET */ + +struct trap_data { + bool ei; + const char *msg; }; +static const struct trap_data trap_data[] = { + [T_PRIVINFLT] = { .ei = true, .msg = "privileged instruction fault" }, + [T_BPTFLT] = { .ei = false, .msg = "breakpoint instruction fault" }, + [T_ARITHTRAP] = { .ei = true, .msg = "arithmetic trap" }, + [T_PROTFLT] = { .ei = true, .msg = "general protection fault" }, + [T_TRCTRAP] = { .ei = false, .msg = "trace trap" }, + [T_PAGEFLT] = { .ei = true, .msg = "page fault" }, + [T_ALIGNFLT] = { .ei = true, .msg = "alignment fault" }, + [T_DIVIDE] = { .ei = true, .msg = "integer divide fault" }, + [T_NMI] = { .ei = false, .msg = "non-maskable interrupt trap" }, + [T_OFLOW] = { .ei = true, .msg = "overflow trap" }, + [T_BOUND] = { .ei = true, .msg = "FPU bounds check fault" }, + [T_DNA] = { .ei = true, .msg = "FPU device not available" }, + [T_DOUBLEFLT] = { .ei = false, .msg = "double fault" }, + [T_FPOPFLT] = { .ei = true, .msg = "FPU operand fetch fault" }, + [T_TSSFLT] = { .ei = true, .msg = "invalid TSS fault" }, + [T_SEGNPFLT] = { .ei = true, .msg = "segment not present fault" }, + [T_STKFLT] = { .ei = true, .msg = "stack fault" 
}, + [T_MCHK] = { .ei = true, .msg = "machine check trap" }, + [T_XMMFLT] = { .ei = true, .msg = "SIMD floating-point exception" }, + [T_DTRACE_RET] = { .ei = true, .msg = "DTrace pid return trap" }, +}; + +static bool +trap_enable_intr(int trapno) +{ + + MPASS(trapno > 0); + if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL) + return (trap_data[trapno].ei); + return (false); +} + +static const char * +trap_msg(int trapno) +{ + const char *res; + static const char unkn[] = "UNKNOWN"; + + res = NULL; + if (trapno < nitems(trap_data)) + res = trap_data[trapno].msg; + if (res == NULL) + res = unkn; + return (res); +} + #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif static int prot_fault_translation = 0; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; #ifdef KDB register_t dr6; #endif int signo, ucode; u_int type; register_t addr; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif td = curthread; p = td->td_proc; signo = 0; ucode = 0; addr = 0; VM_CNT_INC(v_trap); type = frame->tf_trapno; + KASSERT((read_eflags() & PSL_I) == 0, + ("trap: interrupts enabled, type %d frame %p", type, frame)); + #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) return; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) return; #endif } if (type == T_MCHK) { mca_intr(); return; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) return; #endif - if ((frame->tf_eflags & PSL_I) == 0) { - /* - * Buggy application or kernel code has disabled - * interrupts and then trapped. Enabling interrupts - * now is wrong, but it is better than running with - * interrupts disabled until they are accidentally - * enabled later.
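Because trap_data uses designated initializers, every vector without an explicit entry decays to a zeroed slot whose msg is NULL, and both helpers above treat such slots as unknown. A hypothetical self-check of that behavior (MPASS and the kernel strcmp are assumed available here):

    static void
    trap_table_selftest(void)
    {
        MPASS(trap_enable_intr(T_PAGEFLT));     /* listed, .ei = true */
        MPASS(!trap_enable_intr(T_TRCTRAP));    /* listed, .ei = false */
        MPASS(!trap_enable_intr(T_RESERVED));   /* no table entry */
        MPASS(strcmp(trap_msg(T_RESERVED), "UNKNOWN") == 0);
    }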
- */ - if (TRAPF_USERMODE(frame) && - (curpcb->pcb_flags & PCB_VM86CALL) == 0) - uprintf( - "pid %ld (%s): trap %d with interrupts disabled\n", - (long)curproc->p_pid, curthread->td_name, type); - else if (type != T_NMI && type != T_BPTFLT && - type != T_TRCTRAP && - frame->tf_eip != (int)cpu_switch_load_gs) { - /* - * XXX not quite right, since this may be for a - * multiple fault in user mode. - */ - printf("kernel trap %d with interrupts disabled\n", - type); - /* - * Page faults need interrupts disabled until later, - * and we shouldn't enable interrupts while holding - * a spin lock. - */ - if (type != T_PAGEFLT && - td->td_md.md_spinlock_count == 0) - enable_intr(); - } - } - eva = 0; - if (type == T_PAGEFLT) { - /* - * For some Cyrix CPUs, %cr2 is clobbered by - * interrupts. This problem is worked around by using - * an interrupt gate for the pagefault handler. We - * are finally ready to read %cr2 and conditionally - * reenable interrupts. If we hold a spin lock, then - * we must not reenable interrupts. This might be a - * spurious page fault. - */ + /* + * We must not allow context switches until %cr2 is read. + * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts. + * All faults use interrupt gates, so %cr2 can be safely read + * now, before optional enable of the interrupts below. + */ + if (type == T_PAGEFLT) eva = rcr2(); - if (td->td_md.md_spinlock_count == 0) - enable_intr(); - } + /* + * Buggy application or kernel code has disabled interrupts + * and then trapped. Enabling interrupts now is wrong, but it + * is better than running with interrupts disabled until they + * are accidentally enabled later. + */ + if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) && + (curpcb->pcb_flags & PCB_VM86CALL) == 0) + uprintf("pid %ld (%s): trap %d with interrupts disabled\n", + (long)curproc->p_pid, curthread->td_name, type); + + /* + * Conditionally reenable interrupts. If we hold a spin lock, + * then we must not reenable interrupts. This might be a + * spurious page fault. + */ + if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 && + frame->tf_eip != (int)cpu_switch_load_gs) + enable_intr(); + if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(frame) == 0) return; } #endif user_trctrap_out: frame->tf_eflags &= ~PSL_T; signo = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; /* * The following two traps can happen in vm86 mode, * and, if so, we want to handle them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { signo = vm86_emulate((struct vm86frame *)frame); if (signo == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (signo == 0) goto user; break; } signo = SIGBUS; ucode = (type == T_PROTFLT) ? 
BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ signo = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (signo == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; signo = SIGILL; break; } #endif if (signo == -1) return; if (signo == 0) goto user; if (signo == SIGSEGV) ucode = SEGV_MAPERR; else if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { signo = SIGSEGV; ucode = SEGV_ACCERR; } else { signo = SIGBUS; ucode = T_PAGEFLT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ signo = SIGBUS; ucode = T_PAGEFLT; } else { /* * Always SIGSEGV mode. */ signo = SIGSEGV; ucode = SEGV_ACCERR; } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } return; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); return; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) return; uprintf("pid %d killed due to lack of floating point\n", p->p_pid); signo = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = npxtrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); if (dtrace_return_probe_ptr != NULL) dtrace_return_probe_ptr(frame); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) return; break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); return; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. 
*/ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { signo = vm86_emulate((struct vm86frame *)frame); if (signo == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (signo != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); return; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif return; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. + * + * N.B. Compared to long mode, 32-bit mode + * does not push %esp on the trap frame, + * because iretl faulted while in ring 0. As + * a consequence, there is no need to fix up + * the stack pointer for doreti_iret_fault; + * the fixup and the complementary trap() call + * are executed on the main thread stack, not + * on the trampoline stack. */ - if (frame->tf_eip == (int)doreti_iret) { - frame->tf_eip = (int)doreti_iret_fault; + if (frame->tf_eip == (int)doreti_iret + setidt_disp) { + frame->tf_eip = (int)doreti_iret_fault + + setidt_disp; return; } if (type == T_STKFLT) break; - if (frame->tf_eip == (int)doreti_popl_ds) { - frame->tf_eip = (int)doreti_popl_ds_fault; + if (frame->tf_eip == (int)doreti_popl_ds + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_ds_fault + + setidt_disp; return; } - if (frame->tf_eip == (int)doreti_popl_es) { - frame->tf_eip = (int)doreti_popl_es_fault; + if (frame->tf_eip == (int)doreti_popl_es + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_es_fault + + setidt_disp; return; } - if (frame->tf_eip == (int)doreti_popl_fs) { - frame->tf_eip = (int)doreti_popl_fs_fault; + if (frame->tf_eip == (int)doreti_popl_fs + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_fs_fault + + setidt_disp; return; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ kernel_trctrap: - if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { - /* - * We've just entered system mode via the - * syscall lcall. Continue single stepping - * silently until the syscall handler has - * saved the flags. - */ - return; - } - if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { - /* - * The syscall handler has now saved the - * flags. Stop single stepping it.
- */ - frame->tf_eflags &= ~PSL_T; - return; - } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); return; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) return; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } return; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); return; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); return; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap != NULL) signo = (*p->p_sysent->sv_transtrap)(signo, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " - "addr 0x%x esp 0x%08x eip 0x%08x " + "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, - addr, frame->tf_esp, frame->tf_eip, + addr, frame->tf_ss, frame->tf_esp, frame->tf_cs, + frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } static int trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva) { struct thread *td; struct proc *p; vm_offset_t va; vm_map_t map; int rv; vm_prot_t ftype; td = curthread; p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. 
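
The TDP_NOFAULTING logic above implements the vm_fault_{disable,enable}_pagefaults() bracket named in the comment: within the bracket, the first fault on a page is assumed to be spurious (a stale TLB entry or processor errata) and is simply retried; only a repeat fault at the same address falls through to normal processing. A hedged sketch of a caller, with the guarded access left abstract (touch_pages_nofault() is hypothetical):

	int save;

	save = vm_fault_disable_pagefaults();
	/*
	 * Accesses in here may take one spurious fault per page;
	 * trap_pfault() retries them without entering the VM system.
	 */
	touch_pages_nofault();
	vm_fault_enable_pagefaults(save);
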
The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); - if (va >= KERNBASE) { + if (va >= PMAP_TRM_MIN_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return (-2); #endif if (usermode) return (SIGSEGV); - - map = kernel_map; + trap_fatal(frame, eva); + return (-1); } else { - map = &p->p_vmspace->vm_map; + map = usermode ? &p->p_vmspace->vm_map : kernel_map; /* - * When accessing a user-space address, kernel must be - * ready to accept the page fault, and provide a - * handling routine. Since accessing the address - * without the handler is a bug, do not try to handle - * it normally, and panic immediately. + * Kernel cannot access a user-space address directly + * because user pages are not mapped. Also, page + * faults must not be caused while servicing interrupts. */ - if (!usermode && (td->td_intr_nesting_level != 0 || - curpcb->pcb_onfault == NULL)) { + if (!usermode && td->td_intr_nesting_level != 0) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; #if defined(PAE) || defined(PAE_TABLES) else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; #endif else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; - char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); - if (type <= MAX_TRAP_MSG) - msg = trap_msg[type]; - else - msg = "UNKNOWN"; - printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, + printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type), frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ?
"user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", #if defined(PAE) || defined(PAE_TABLES) pg_nx != 0 ? (code & PGEX_I ? " instruction" : " data") : #endif "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) { frame->tf_err = eva; /* smuggle fault address to ddb */ if (kdb_trap(type, 0, frame)) { frame->tf_err = code; /* restore error code */ return; } frame->tf_err = code; /* restore error code */ } #endif printf("trap number = %d\n", type); - if (type <= MAX_TRAP_MSG) - panic("%s", trap_msg[type]); + if (trap_msg(type) != NULL) + panic("%s", trap_msg(type)); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. 
*/ void -dblfault_handler() +dblfault_handler(void) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); - printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); - printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); - printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); + printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip); + printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp); + printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; caddr_t params; long tmp; int error; +#ifdef COMPAT_43 + u_int32_t eip; + int cs; +#endif p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; - params = (caddr_t)frame->tf_esp + sizeof(int); +#ifdef COMPAT_43 + if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) { + /* + * In lcall $7,$0 after int $0x80. Convert the user + * frame to what it would be for a direct int 0x80 instead + * of lcall $7,$0, by popping the lcall return address. + */ + error = fueword32((void *)frame->tf_esp, &eip); + if (error == -1) + return (EFAULT); + cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t))); + if (cs == -1) + return (EFAULT); + + /* + * Unwind in-kernel frame after all stack frame pieces + * were successfully read. + */ + frame->tf_eip = eip; + frame->tf_cs = cs; + frame->tf_esp += 2 * sizeof(u_int32_t); + frame->tf_err = 7; /* size of lcall $7,$0 */ + } +#endif + sa->code = frame->tf_eax; + params = (caddr_t)frame->tf_esp + sizeof(uint32_t); /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; - params += sizeof(int); + params += sizeof(uint32_t); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)sa->args, - (u_int)(sa->narg * sizeof(int))); + (u_int)(sa->narg * sizeof(uint32_t))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; register_t orig_tf_eflags; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; error = syscallenter(td); /* * Traced syscall. 
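
cpu_fetch_syscall_args() above also handles the two indirect entry points: syscall(2) passes the real syscall number as the first 32-bit argument, while __syscall(2) passes it as a quad so the remaining arguments keep 8-byte alignment. Schematically (fetch32() is an invented stand-in for the fueword() reads):

	if (code == SYS_syscall) {
		code = fetch32(params);		/* number is the first word */
		params += sizeof(uint32_t);
	} else if (code == SYS___syscall) {
		code = fetch32(params);		/* low word of the quad */
		params += sizeof(quad_t);	/* preserve quad alignment */
	}
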
*/ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); syscallret(td, error); } Index: head/sys/i386/i386/vm86.c =================================================================== --- head/sys/i386/i386/vm86.c (revision 332488) +++ head/sys/i386/i386/vm86.c (revision 332489) @@ -1,723 +1,782 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int vm86pa; extern struct pcb *vm86pcb; static struct mtx vm86_lock; extern int vm86_bioscall(struct vm86frame *); extern void vm86_biosret(struct vm86frame *); void vm86_prepcall(struct vm86frame *); struct system_map { int type; vm_offset_t start; vm_offset_t end; }; #define HLT 0xf4 #define CLI 0xfa #define STI 0xfb #define PUSHF 0x9c #define POPF 0x9d #define INTn 0xcd #define IRET 0xcf #define CALLm 0xff #define OPERAND_SIZE_PREFIX 0x66 #define ADDRESS_SIZE_PREFIX 0x67 #define PUSH_MASK ~(PSL_VM | PSL_RF | PSL_I) #define POP_MASK ~(PSL_VIP | PSL_VIF | PSL_VM | PSL_RF | PSL_IOPL) +static int +vm86_suword16(volatile void *base, int word) +{ + + if (curthread->td_critnest != 0) { + *(volatile uint16_t *)base = word; + return (0); + } + return (suword16(base, word)); +} + +static int +vm86_suword(volatile void *base, long word) +{ + + if (curthread->td_critnest != 0) { + *(volatile long *)base = word; + return (0); + } + return (suword(base, word)); +} + +static int +vm86_fubyte(volatile const void *base) +{ + + if (curthread->td_critnest != 0) + return (*(volatile const u_char *)base); + return (fubyte(base)); +} + +static int +vm86_fuword16(volatile const void *base) +{ + + if (curthread->td_critnest != 0) + return (*(volatile const uint16_t *)base); + return (fuword16(base)); +} + +static long +vm86_fuword(volatile const void *base) +{ + + if (curthread->td_critnest != 0) + return (*(volatile const long *)base); + return (fuword(base)); +} + static __inline caddr_t MAKE_ADDR(u_short sel, u_short off) { return ((caddr_t)((sel << 4) + off)); } static __inline void GET_VEC(u_int vec, u_short *sel, u_short *off) { *sel = vec >> 16; *off = vec & 0xffff; } static __inline u_int MAKE_VEC(u_short sel, u_short off) { return ((sel << 16) | off); } static __inline void PUSH(u_short x, struct vm86frame *vmf) { vmf->vmf_sp -= 2; - suword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x); + vm86_suword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x); } static __inline void PUSHL(u_int x, struct vm86frame *vmf) { vmf->vmf_sp -= 4; - suword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x); + vm86_suword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x); } static __inline u_short POP(struct vm86frame *vmf) { - u_short x = fuword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp)); + u_short x = vm86_fuword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp)); vmf->vmf_sp += 2; return (x); } static __inline u_int POPL(struct vm86frame *vmf) { - u_int x = fuword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp)); + u_int x = vm86_fuword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp)); vmf->vmf_sp += 4; return (x); } int vm86_emulate(struct vm86frame *vmf) { struct vm86_kernel *vm86; caddr_t addr; u_char i_byte; u_int temp_flags; int inc_ip = 1; int retcode = 0; /* * pcb_ext contains the address of the extension area, or zero if * the extension is not present. 
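
The MAKE_ADDR()/MAKE_VEC() helpers above are plain real-mode segment arithmetic: a linear address is segment * 16 + offset, and an IVT vector packs seg:off into 32 bits. A small standalone illustration (values arbitrary):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned short sel = 0xf000, off = 0xfff0;	/* the x86 reset vector */
		unsigned lin = ((unsigned)sel << 4) + off;	/* MAKE_ADDR */
		unsigned vec = ((unsigned)sel << 16) | off;	/* MAKE_VEC */

		printf("linear %#07x vector %#010x\n", lin, vec);
		/* prints: linear 0xffff0 vector 0xf000fff0 */
		return (0);
	}
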
(This check should not be needed, * as we can't enter vm86 mode until we set up an extension area) */ if (curpcb->pcb_ext == 0) return (SIGBUS); vm86 = &curpcb->pcb_ext->ext_vm86; if (vmf->vmf_eflags & PSL_T) retcode = SIGTRAP; addr = MAKE_ADDR(vmf->vmf_cs, vmf->vmf_ip); - i_byte = fubyte(addr); + i_byte = vm86_fubyte(addr); if (i_byte == ADDRESS_SIZE_PREFIX) { - i_byte = fubyte(++addr); + i_byte = vm86_fubyte(++addr); inc_ip++; } if (vm86->vm86_has_vme) { switch (i_byte) { case OPERAND_SIZE_PREFIX: - i_byte = fubyte(++addr); + i_byte = vm86_fubyte(++addr); inc_ip++; switch (i_byte) { case PUSHF: if (vmf->vmf_eflags & PSL_VIF) PUSHL((vmf->vmf_eflags & PUSH_MASK) | PSL_IOPL | PSL_I, vmf); else PUSHL((vmf->vmf_eflags & PUSH_MASK) | PSL_IOPL, vmf); vmf->vmf_ip += inc_ip; return (retcode); case POPF: temp_flags = POPL(vmf) & POP_MASK; vmf->vmf_eflags = (vmf->vmf_eflags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; vmf->vmf_ip += inc_ip; if (temp_flags & PSL_I) { vmf->vmf_eflags |= PSL_VIF; if (vmf->vmf_eflags & PSL_VIP) break; } else { vmf->vmf_eflags &= ~PSL_VIF; } return (retcode); } break; /* VME faults here if VIP is set, but does not set VIF. */ case STI: vmf->vmf_eflags |= PSL_VIF; vmf->vmf_ip += inc_ip; if ((vmf->vmf_eflags & PSL_VIP) == 0) { uprintf("fatal sti\n"); return (SIGKILL); } break; /* VME if no redirection support */ case INTn: break; /* VME if trying to set PSL_T, or PSL_I when VIP is set */ case POPF: temp_flags = POP(vmf) & POP_MASK; vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; vmf->vmf_ip += inc_ip; if (temp_flags & PSL_I) { vmf->vmf_eflags |= PSL_VIF; if (vmf->vmf_eflags & PSL_VIP) break; } else { vmf->vmf_eflags &= ~PSL_VIF; } return (retcode); /* VME if trying to set PSL_T, or PSL_I when VIP is set */ case IRET: vmf->vmf_ip = POP(vmf); vmf->vmf_cs = POP(vmf); temp_flags = POP(vmf) & POP_MASK; vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; if (temp_flags & PSL_I) { vmf->vmf_eflags |= PSL_VIF; if (vmf->vmf_eflags & PSL_VIP) break; } else { vmf->vmf_eflags &= ~PSL_VIF; } return (retcode); } return (SIGBUS); } switch (i_byte) { case OPERAND_SIZE_PREFIX: - i_byte = fubyte(++addr); + i_byte = vm86_fubyte(++addr); inc_ip++; switch (i_byte) { case PUSHF: if (vm86->vm86_eflags & PSL_VIF) PUSHL((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL | PSL_I, vmf); else PUSHL((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf); vmf->vmf_ip += inc_ip; return (retcode); case POPF: temp_flags = POPL(vmf) & POP_MASK; vmf->vmf_eflags = (vmf->vmf_eflags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; vmf->vmf_ip += inc_ip; if (temp_flags & PSL_I) { vm86->vm86_eflags |= PSL_VIF; if (vm86->vm86_eflags & PSL_VIP) break; } else { vm86->vm86_eflags &= ~PSL_VIF; } return (retcode); } return (SIGBUS); case CLI: vm86->vm86_eflags &= ~PSL_VIF; vmf->vmf_ip += inc_ip; return (retcode); case STI: /* if there is a pending interrupt, go to the emulator */ vm86->vm86_eflags |= PSL_VIF; vmf->vmf_ip += inc_ip; if (vm86->vm86_eflags & PSL_VIP) break; return (retcode); case PUSHF: if (vm86->vm86_eflags & PSL_VIF) PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL | PSL_I, vmf); else PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf); vmf->vmf_ip += inc_ip; return (retcode); case INTn: - i_byte = fubyte(addr + 1); + i_byte = vm86_fubyte(addr + 1); if ((vm86->vm86_intmap[i_byte >> 3] & (1 << (i_byte & 7))) != 0) break; if (vm86->vm86_eflags & PSL_VIF) PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL | PSL_I, vmf); else PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf); 
PUSH(vmf->vmf_cs, vmf); PUSH(vmf->vmf_ip + inc_ip + 1, vmf); /* increment IP */ - GET_VEC(fuword((caddr_t)(i_byte * 4)), + GET_VEC(vm86_fuword((caddr_t)(i_byte * 4)), &vmf->vmf_cs, &vmf->vmf_ip); vmf->vmf_flags &= ~PSL_T; vm86->vm86_eflags &= ~PSL_VIF; return (retcode); case IRET: vmf->vmf_ip = POP(vmf); vmf->vmf_cs = POP(vmf); temp_flags = POP(vmf) & POP_MASK; vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; if (temp_flags & PSL_I) { vm86->vm86_eflags |= PSL_VIF; if (vm86->vm86_eflags & PSL_VIP) break; } else { vm86->vm86_eflags &= ~PSL_VIF; } return (retcode); case POPF: temp_flags = POP(vmf) & POP_MASK; vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK) | temp_flags | PSL_VM | PSL_I; vmf->vmf_ip += inc_ip; if (temp_flags & PSL_I) { vm86->vm86_eflags |= PSL_VIF; if (vm86->vm86_eflags & PSL_VIP) break; } else { vm86->vm86_eflags &= ~PSL_VIF; } return (retcode); } return (SIGBUS); } #define PGTABLE_SIZE ((1024 + 64) * 1024 / PAGE_SIZE) #define INTMAP_SIZE 32 #define IOMAP_SIZE ctob(IOPAGES) #define TSS_SIZE \ (sizeof(struct pcb_ext) - sizeof(struct segment_descriptor) + \ INTMAP_SIZE + IOMAP_SIZE + 1) struct vm86_layout { pt_entry_t vml_pgtbl[PGTABLE_SIZE]; struct pcb vml_pcb; struct pcb_ext vml_ext; char vml_intmap[INTMAP_SIZE]; char vml_iomap[IOMAP_SIZE]; char vml_iomap_trailer; }; void vm86_initialize(void) { int i; u_int *addr; struct vm86_layout *vml = (struct vm86_layout *)vm86paddr; struct pcb *pcb; struct pcb_ext *ext; struct soft_segment_descriptor ssd = { 0, /* segment base address (overwritten) */ 0, /* length (overwritten) */ SDT_SYS386TSS, /* segment type */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 0, /* default 16 size */ 0 /* granularity */ }; /* * this should be a compile time error, but cpp doesn't grok sizeof(). */ if (sizeof(struct vm86_layout) > ctob(3)) panic("struct vm86_layout exceeds space allocated in locore.s"); /* * Below is the memory layout that we use for the vm86 region. * * +--------+ * | | * | | * | page 0 | * | | +--------+ * | | | stack | * +--------+ +--------+ <--------- vm86paddr * | | |Page Tbl| 1M + 64K = 272 entries = 1088 bytes * | | +--------+ * | | | PCB | size: ~240 bytes * | page 1 | |PCB Ext | size: ~140 bytes (includes TSS) * | | +--------+ * | | |int map | * | | +--------+ * +--------+ | | * | page 2 | | I/O | * +--------+ | bitmap | * | page 3 | | | * | | +--------+ * +--------+ */ /* * A rudimentary PCB must be installed, in order to get to the * PCB extension area. We use the PCB area as a scratchpad for * data storage, the layout of which is shown below. 
* * pcb_esi = new PTD entry 0 * pcb_ebp = pointer to frame on vm86 stack * pcb_esp = stack frame pointer at time of switch * pcb_ebx = va of vm86 page table * pcb_eip = argument pointer to initial call * pcb_vm86[0] = saved TSS descriptor, word 0 * pcb_vm86[1] = saved TSS descriptor, word 1 */ #define new_ptd pcb_esi #define vm86_frame pcb_ebp #define pgtable_va pcb_ebx pcb = &vml->vml_pcb; ext = &vml->vml_ext; mtx_init(&vm86_lock, "vm86 lock", NULL, MTX_DEF); bzero(pcb, sizeof(struct pcb)); pcb->new_ptd = vm86pa | PG_V | PG_RW | PG_U; pcb->vm86_frame = vm86paddr - sizeof(struct vm86frame); pcb->pgtable_va = vm86paddr; pcb->pcb_flags = PCB_VM86CALL; pcb->pcb_ext = ext; bzero(ext, sizeof(struct pcb_ext)); ext->ext_tss.tss_esp0 = vm86paddr; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); ext->ext_tss.tss_ioopt = ((u_int)vml->vml_iomap - (u_int)&ext->ext_tss) << 16; ext->ext_iomap = vml->vml_iomap; ext->ext_vm86.vm86_intmap = vml->vml_intmap; if (cpu_feature & CPUID_VME) ext->ext_vm86.vm86_has_vme = (rcr4() & CR4_VME ? 1 : 0); addr = (u_int *)ext->ext_vm86.vm86_intmap; for (i = 0; i < (INTMAP_SIZE + IOMAP_SIZE) / sizeof(u_int); i++) *addr++ = 0; vml->vml_iomap_trailer = 0xff; ssd.ssd_base = (u_int)&ext->ext_tss; ssd.ssd_limit = TSS_SIZE - 1; ssdtosd(&ssd, &ext->ext_tssd); vm86pcb = pcb; #if 0 /* * use whatever is leftover of the vm86 page layout as a * message buffer so we can capture early output. */ msgbufinit((vm_offset_t)vm86paddr + sizeof(struct vm86_layout), ctob(3) - sizeof(struct vm86_layout)); #endif } vm_offset_t vm86_getpage(struct vm86context *vmc, int pagenum) { int i; for (i = 0; i < vmc->npages; i++) if (vmc->pmap[i].pte_num == pagenum) return (vmc->pmap[i].kva); return (0); } vm_offset_t vm86_addpage(struct vm86context *vmc, int pagenum, vm_offset_t kva) { int i, flags = 0; for (i = 0; i < vmc->npages; i++) if (vmc->pmap[i].pte_num == pagenum) goto overlap; if (vmc->npages == VM86_PMAPSIZE) goto full; /* XXX grow map? */ if (kva == 0) { kva = (vm_offset_t)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); flags = VMAP_MALLOC; } i = vmc->npages++; vmc->pmap[i].flags = flags; vmc->pmap[i].kva = kva; vmc->pmap[i].pte_num = pagenum; return (kva); overlap: panic("vm86_addpage: overlap"); full: panic("vm86_addpage: not enough room"); } /* * called from vm86_bioscall, while in vm86 address space, to finalize setup. */ void vm86_prepcall(struct vm86frame *vmf) { struct vm86_kernel *vm86; uint32_t *stack; uint8_t *code; code = (void *)0xa00; stack = (void *)(0x1000 - 2); /* keep aligned */ if ((vmf->vmf_trapno & PAGE_MASK) <= 0xff) { /* interrupt call requested */ code[0] = INTn; code[1] = vmf->vmf_trapno & 0xff; code[2] = HLT; vmf->vmf_ip = (uintptr_t)code; vmf->vmf_cs = 0; } else { code[0] = HLT; stack--; stack[0] = MAKE_VEC(0, (uintptr_t)code); } vmf->vmf_sp = (uintptr_t)stack; vmf->vmf_ss = 0; vmf->kernel_fs = vmf->kernel_es = vmf->kernel_ds = 0; vmf->vmf_eflags = PSL_VIF | PSL_VM | PSL_USER; vm86 = &curpcb->pcb_ext->ext_vm86; if (!vm86->vm86_has_vme) vm86->vm86_eflags = vmf->vmf_eflags; /* save VIF, VIP */ } /* * vm86 trap handler; determines whether routine succeeded or not. * Called while in vm86 space, returns to calling process. 
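
vm86_prepcall() above assembles a tiny real-mode stub out of the opcode #defines near the top of the file: INTn (0xcd) followed by the vector byte, then HLT (0xf4) so the virtual machine traps back into the kernel once the interrupt returns. The generated bytes, spelled out (illustrative fragment, not the committed code):

	code[0] = INTn;				/* 0xcd: int $imm8 */
	code[1] = vmf->vmf_trapno & 0xff;	/* requested vector */
	code[2] = HLT;				/* 0xf4: re-enter the kernel */
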
*/ void vm86_trap(struct vm86frame *vmf) { + void (*p)(struct vm86frame *); caddr_t addr; /* "should not happen" */ if ((vmf->vmf_eflags & PSL_VM) == 0) panic("vm86_trap called, but not in vm86 mode"); addr = MAKE_ADDR(vmf->vmf_cs, vmf->vmf_ip); if (*(u_char *)addr == HLT) vmf->vmf_trapno = vmf->vmf_eflags & PSL_C; else vmf->vmf_trapno = vmf->vmf_trapno << 16; - vm86_biosret(vmf); + p = (void (*)(struct vm86frame *))((uintptr_t)vm86_biosret + + setidt_disp); + p(vmf); } int vm86_intcall(int intnum, struct vm86frame *vmf) { + int (*p)(struct vm86frame *); int retval; if (intnum < 0 || intnum > 0xff) return (EINVAL); vmf->vmf_trapno = intnum; + p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall + + setidt_disp); mtx_lock(&vm86_lock); critical_enter(); - retval = vm86_bioscall(vmf); + retval = p(vmf); critical_exit(); mtx_unlock(&vm86_lock); return (retval); } /* * struct vm86context contains the page table to use when making * vm86 calls. If intnum is a valid interrupt number (0-255), then * the "interrupt trampoline" will be used, otherwise we use the * caller's cs:ip routine. */ int vm86_datacall(int intnum, struct vm86frame *vmf, struct vm86context *vmc) { - pt_entry_t *pte = (pt_entry_t *)vm86paddr; + pt_entry_t *pte; + int (*p)(struct vm86frame *); vm_paddr_t page; int i, entry, retval; + pte = (pt_entry_t *)vm86paddr; mtx_lock(&vm86_lock); for (i = 0; i < vmc->npages; i++) { page = vtophys(vmc->pmap[i].kva & PG_FRAME); entry = vmc->pmap[i].pte_num; vmc->pmap[i].old_pte = pte[entry]; pte[entry] = page | PG_V | PG_RW | PG_U; pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva); } vmf->vmf_trapno = intnum; + p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall + + setidt_disp); critical_enter(); - retval = vm86_bioscall(vmf); + retval = p(vmf); critical_exit(); for (i = 0; i < vmc->npages; i++) { entry = vmc->pmap[i].pte_num; pte[entry] = vmc->pmap[i].old_pte; pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva); } mtx_unlock(&vm86_lock); return (retval); } vm_offset_t vm86_getaddr(struct vm86context *vmc, u_short sel, u_short off) { int i, page; vm_offset_t addr; addr = (vm_offset_t)MAKE_ADDR(sel, off); page = addr >> PAGE_SHIFT; for (i = 0; i < vmc->npages; i++) if (page == vmc->pmap[i].pte_num) return (vmc->pmap[i].kva + (addr & PAGE_MASK)); return (0); } int vm86_getptr(struct vm86context *vmc, vm_offset_t kva, u_short *sel, u_short *off) { int i; for (i = 0; i < vmc->npages; i++) if (kva >= vmc->pmap[i].kva && kva < vmc->pmap[i].kva + PAGE_SIZE) { *off = kva - vmc->pmap[i].kva; *sel = vmc->pmap[i].pte_num << 8; return (1); } return (0); } int vm86_sysarch(struct thread *td, char *args) { int error = 0; struct i386_vm86_args ua; struct vm86_kernel *vm86; if ((error = copyin(args, &ua, sizeof(struct i386_vm86_args))) != 0) return (error); if (td->td_pcb->pcb_ext == 0) if ((error = i386_extend_pcb(td)) != 0) return (error); vm86 = &td->td_pcb->pcb_ext->ext_vm86; switch (ua.sub_op) { case VM86_INIT: { struct vm86_init_args sa; if ((error = copyin(ua.sub_args, &sa, sizeof(sa))) != 0) return (error); if (cpu_feature & CPUID_VME) vm86->vm86_has_vme = (rcr4() & CR4_VME ? 
1 : 0); else vm86->vm86_has_vme = 0; vm86->vm86_inited = 1; vm86->vm86_debug = sa.debug; bcopy(&sa.int_map, vm86->vm86_intmap, 32); } break; #if 0 case VM86_SET_VME: { struct vm86_vme_args sa; if ((cpu_feature & CPUID_VME) == 0) return (ENODEV); if (error = copyin(ua.sub_args, &sa, sizeof(sa))) return (error); if (sa.state) load_cr4(rcr4() | CR4_VME); else load_cr4(rcr4() & ~CR4_VME); } break; #endif case VM86_GET_VME: { struct vm86_vme_args sa; sa.state = (rcr4() & CR4_VME ? 1 : 0); error = copyout(&sa, ua.sub_args, sizeof(sa)); } break; case VM86_INTCALL: { struct vm86_intcall_args sa; if ((error = priv_check(td, PRIV_VM86_INTCALL))) return (error); if ((error = copyin(ua.sub_args, &sa, sizeof(sa)))) return (error); if ((error = vm86_intcall(sa.intnum, &sa.vmf))) return (error); error = copyout(&sa, ua.sub_args, sizeof(sa)); } break; default: error = EINVAL; } return (error); } Index: head/sys/i386/i386/vm86bios.s =================================================================== --- head/sys/i386/i386/vm86bios.s (revision 332488) +++ head/sys/i386/i386/vm86bios.s (revision 332489) @@ -1,169 +1,169 @@ /*- * Copyright (c) 1998 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include /* miscellaneous asm macros */ #include #include "assym.inc" #define SCR_NEWPTD PCB_ESI /* readability macros */ #define SCR_VMFRAME PCB_EBP /* see vm86.c for explanation */ #define SCR_STACK PCB_ESP #define SCR_PGTABLE PCB_EBX #define SCR_ARGFRAME PCB_EIP #define SCR_TSS0 PCB_VM86 #define SCR_TSS1 (PCB_VM86+4) .data ALIGN_DATA .globl vm86pcb vm86pcb: .long 0 .text /* * vm86_bioscall(struct trapframe_vm86 *vm86) */ ENTRY(vm86_bioscall) movl vm86pcb,%edx /* scratch data area */ movl 4(%esp),%eax movl %eax,SCR_ARGFRAME(%edx) /* save argument pointer */ pushl %ebx pushl %ebp pushl %esi pushl %edi pushl %gs movl PCPU(CURTHREAD),%ecx cmpl %ecx,PCPU(FPCURTHREAD) /* do we need to save fp? 
*/ jne 1f pushl %edx movl TD_PCB(%ecx),%ecx pushl PCB_SAVEFPU(%ecx) call npxsave addl $4,%esp popl %edx /* recover our pcb */ 1: movl SCR_VMFRAME(%edx),%ebx /* target frame location */ movl %ebx,%edi /* destination */ movl SCR_ARGFRAME(%edx),%esi /* source (set on entry) */ movl $VM86_FRAMESIZE/4,%ecx /* sizeof(struct vm86frame)/4 */ cld rep movsl /* copy frame to new stack */ movl PCPU(CURPCB),%eax pushl %eax /* save curpcb */ movl %edx,PCPU(CURPCB) /* set curpcb to vm86pcb */ movl PCPU(TSS_GDT),%ebx /* entry in GDT */ movl 0(%ebx),%eax movl %eax,SCR_TSS0(%edx) /* save first word */ movl 4(%ebx),%eax andl $~0x200, %eax /* flip 386BSY -> 386TSS */ movl %eax,SCR_TSS1(%edx) /* save second word */ movl PCB_EXT(%edx),%edi /* vm86 tssd entry */ movl 0(%edi),%eax movl %eax,0(%ebx) movl 4(%edi),%eax movl %eax,4(%ebx) movl $GPROC0_SEL*8,%esi /* GSEL(entry, SEL_KPL) */ ltr %si movl %cr3,%eax pushl %eax /* save address space */ - movl IdlePTD,%ecx + movl IdlePTD,%ecx /* va (and pa) of Idle PTD */ movl %ecx,%ebx - addl $KERNBASE,%ebx /* va of Idle PTD */ movl 0(%ebx),%eax pushl %eax /* old ptde != 0 when booting */ pushl %ebx /* keep for reuse */ movl %esp,SCR_STACK(%edx) /* save current stack location */ movl SCR_NEWPTD(%edx),%eax /* mapping for vm86 page table */ movl %eax,0(%ebx) /* ... install as PTD entry 0 */ #if defined(PAE) || defined(PAE_TABLES) movl IdlePDPT,%ecx #endif movl %ecx,%cr3 /* new page tables */ movl SCR_VMFRAME(%edx),%esp /* switch to new stack */ pushl %esp - call vm86_prepcall /* finish setup */ + movl $vm86_prepcall, %eax + call *%eax /* finish setup */ add $4, %esp /* * Return via doreti */ MEXITCOUNT jmp doreti /* * vm86_biosret(struct trapframe_vm86 *vm86) */ ENTRY(vm86_biosret) movl vm86pcb,%edx /* data area */ movl 4(%esp),%esi /* source */ movl SCR_ARGFRAME(%edx),%edi /* destination */ movl $VM86_FRAMESIZE/4,%ecx /* size */ cld rep movsl /* copy frame to original frame */ movl SCR_STACK(%edx),%esp /* back to old stack */ popl %ebx /* saved va of Idle PTD */ popl %eax movl %eax,0(%ebx) /* restore old pte */ popl %eax movl %eax,%cr3 /* install old page table */ movl PCPU(TSS_GDT),%ebx /* entry in GDT */ movl SCR_TSS0(%edx),%eax movl %eax,0(%ebx) /* restore first word */ movl SCR_TSS1(%edx),%eax movl %eax,4(%ebx) /* restore second word */ movl $GPROC0_SEL*8,%esi /* GSEL(entry, SEL_KPL) */ ltr %si popl PCPU(CURPCB) /* restore curpcb/curproc */ movl SCR_ARGFRAME(%edx),%edx /* original stack frame */ movl TF_TRAPNO(%edx),%eax /* return (trapno) */ popl %gs popl %edi popl %esi popl %ebp popl %ebx ret /* back to our normal program */ Index: head/sys/i386/i386/vm_machdep.c =================================================================== --- head/sys/i386/i386/vm_machdep.c (revision 332488) +++ head/sys/i386/i386/vm_machdep.c (revision 332489) @@ -1,699 +1,701 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ */ #include __FBSDID("$FreeBSD$"); #include "opt_isa.h" #include "opt_npx.h" #include "opt_reset.h" #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif _Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread), "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread."); _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb), "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb."); _Static_assert(__OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf), "__OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf."); union savefpu * get_pcb_user_save_td(struct thread *td) { vm_offset_t p; p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN); KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area")); return ((union savefpu *)p); } union savefpu * get_pcb_user_save_pcb(struct pcb *pcb) { vm_offset_t p; p = (vm_offset_t)(pcb + 1); return ((union savefpu *)p); } struct pcb * get_pcb_td(struct thread *td) { vm_offset_t p; p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) - sizeof(struct pcb); return ((struct pcb *)p); } void * alloc_fpusave(int flags) { void *res; struct savefpu_ymm *sf; res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); if (use_xsave) { sf = (struct savefpu_ymm *)res; bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; } return (res); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child is * ready to run and return to user mode.
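
get_pcb_user_save_td() and get_pcb_td() above carve the top of the kernel stack: the XSAVE save area sits at the highest suitably aligned address and the pcb immediately below it. A worked sketch with invented sizes:

	#include <stdio.h>

	#define	ROUNDUP2(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

	int
	main(void)
	{
		unsigned long kstack = 0xc1000000UL;	/* hypothetical td_kstack */
		unsigned long ksize = 2 * 4096UL;	/* td_kstack_pages * PAGE_SIZE */
		unsigned long xsave_sz = 832;		/* cpu_max_ext_state_size (example) */
		unsigned long pcb_sz = 296;		/* sizeof(struct pcb) (example) */
		unsigned long save, pcb;

		save = kstack + ksize - ROUNDUP2(xsave_sz, 64);	/* XSAVE_AREA_ALIGN */
		pcb = save - pcb_sz;			/* pcb ends where save area begins */
		printf("user save area %#lx, pcb %#lx\n", save, pcb);
		return (0);
	}
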
*/ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { struct proc *p1; struct pcb *pcb2; struct mdproc *mdp2; p1 = td1->td_proc; if ((flags & RFPROC) == 0) { if ((flags & RFMEM) == 0) { /* unshare user LDT */ struct mdproc *mdp1 = &p1->p_md; struct proc_ldt *pldt, *pldt1; mtx_lock_spin(&dt_lock); if ((pldt1 = mdp1->md_ldt) != NULL && pldt1->ldt_refcnt > 1) { pldt = user_ldt_alloc(mdp1, pldt1->ldt_len); if (pldt == NULL) panic("could not copy LDT"); mdp1->md_ldt = pldt; set_user_ldt(mdp1); user_ldt_deref(pldt1); } else mtx_unlock_spin(&dt_lock); } return; } /* Ensure that td1's pcb is up to date. */ if (td1 == curthread) td1->td_pcb->pcb_gs = rgs(); critical_enter(); if (PCPU_GET(fpcurthread) == td1) npxsave(td1->td_pcb->pcb_save); critical_exit(); /* Point the pcb to the top of the stack */ pcb2 = get_pcb_td(td2); td2->td_pcb = pcb2; /* Copy td1's pcb */ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); /* Properly initialize pcb_save */ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), cpu_max_ext_state_size); /* Point mdproc and then copy over td1's contents */ mdp2 = &p2->p_md; bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); /* * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. - * The -16 is so we can expand the trapframe if we go to vm86. + * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe + * if we go to vm86. */ - td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 16) - 1; + td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - + VM86_STACK_SPACE) - 1; bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); td2->td_frame->tf_eax = 0; /* Child returns zero */ td2->td_frame->tf_eflags &= ~PSL_C; /* success */ td2->td_frame->tf_edx = 1; /* * If the parent process has the trap bit set (i.e. a debugger had * single stepped the process to the system call), we need to clear * the trap flag from the new frame unless the debugger had set PF_FORK * on the parent. Otherwise, the child will receive a (likely * unexpected) SIGTRAP when it executes the first instruction after * returning to userland. */ if ((p1->p_pfsflags & PF_FORK) == 0) td2->td_frame->tf_eflags &= ~PSL_T; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ #if defined(PAE) || defined(PAE_TABLES) pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdpt); #else pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); #endif pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */ - pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_eip = (int)fork_trampoline + setidt_disp; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ /* * XXX don't copy the i/o pages. this should probably be fixed. */ pcb2->pcb_ext = 0; /* Copy the LDT, if necessary. 
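
cpu_fork() above stores fork_trampoline + setidt_disp into pcb_eip: with the relocated trampoline in effect, low-level entry/exit code executes at its displaced address, so every saved kernel %eip that will be resumed there must carry the same displacement. vm86_intcall() earlier in this diff uses the same idiom through a function pointer; reduced to its core:

	int (*p)(struct vm86frame *);

	p = (int (*)(struct vm86frame *))
	    ((uintptr_t)vm86_bioscall + setidt_disp);
	retval = p(vmf);		/* call the relocated copy */
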
*/ mtx_lock_spin(&dt_lock); if (mdp2->md_ldt != NULL) { if (flags & RFMEM) { mdp2->md_ldt->ldt_refcnt++; } else { mdp2->md_ldt = user_ldt_alloc(mdp2, mdp2->md_ldt->ldt_len); if (mdp2->md_ldt == NULL) panic("could not copy LDT"); } } mtx_unlock_spin(&dt_lock); /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame * containing the return address when exiting cpu_switch. * This will normally be to fork_trampoline(), which will have * %ebx loaded with the new proc's pointer. fork_trampoline() * will set up a stack to call fork_return(p, frame); to complete * the return to user-mode. */ } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { /* * Note that the trap frame follows the args, so the function * is really called like this: func(arg, frame); */ td->td_pcb->pcb_esi = (int) func; /* function */ td->td_pcb->pcb_ebx = (int) arg; /* first arg */ } void cpu_exit(struct thread *td) { /* * If this process has a custom LDT, release it. Reset pc->pcb_gs * and %gs before we free it in case they refer to an LDT entry. */ mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) { td->td_pcb->pcb_gs = _udatasel; load_gs(_udatasel); user_ldt_free(td); } else mtx_unlock_spin(&dt_lock); } void cpu_thread_exit(struct thread *td) { critical_enter(); if (td == PCPU_GET(fpcurthread)) npxdrop(); critical_exit(); /* Disable any hardware breakpoints. */ if (td->td_pcb->pcb_flags & PCB_DBREGS) { reset_dbregs(); td->td_pcb->pcb_flags &= ~PCB_DBREGS; } } void cpu_thread_clean(struct thread *td) { struct pcb *pcb; pcb = td->td_pcb; if (pcb->pcb_ext != NULL) { /* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */ /* * XXX do we need to move the TSS off the allocated pages * before freeing them? (not done here) */ - kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_ext, - ctob(IOPAGES + 1)); + pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1)); pcb->pcb_ext = NULL; } } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; struct xstate_hdr *xhdr; td->td_pcb = pcb = get_pcb_td(td); - td->td_frame = (struct trapframe *)((caddr_t)pcb - 16) - 1; + td->td_frame = (struct trapframe *)((caddr_t)pcb - + VM86_STACK_SPACE) - 1; pcb->pcb_ext = NULL; pcb->pcb_save = get_pcb_user_save_pcb(pcb); if (use_xsave) { xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); bzero(xhdr, sizeof(*xhdr)); xhdr->xstate_bv = xsave_mask; } } void cpu_thread_free(struct thread *td) { cpu_thread_clean(td); } void cpu_set_syscall_retval(struct thread *td, int error) { switch (error) { case 0: td->td_frame->tf_eax = td->td_retval[0]; td->td_frame->tf_edx = td->td_retval[1]; td->td_frame->tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, int * 0x80 is 2 bytes. We saved this in tf_err. */ td->td_frame->tf_eip -= td->td_frame->tf_err; break; case EJUSTRETURN: break; default: td->td_frame->tf_eax = SV_ABI_ERRNO(td->td_proc, error); td->td_frame->tf_eflags |= PSL_C; break; } } /* * Initialize machine state, mostly pcb and trap frame for a new * thread, about to return to userspace. 
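
cpu_set_syscall_retval() above restarts an interrupted syscall by rewinding %eip by the instruction length stashed in tf_err at kernel entry (7 for lcall $7,$0, 2 for int $0x80). A concrete example with a made-up address:

	unsigned int eip_after = 0x08048abc;	/* tf_eip saved at entry */
	unsigned int insn_len = 2;		/* tf_err: int $0x80 is "cd 80" */
	unsigned int eip_replay;

	eip_replay = eip_after - insn_len;	/* 0x08048aba: back on the cd 80 bytes */
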
Put enough state in the new * thread's PCB to get it to go back to the fork_return(), which * finalizes the thread state and handles peculiarities of the first * return to userspace for the new thread. */ void cpu_copy_thread(struct thread *td, struct thread *td0) { struct pcb *pcb2; /* Point the pcb to the top of the stack. */ pcb2 = td->td_pcb; /* * Copy the upcall pcb. This loads kernel regs. * Those not loaded individually below get their default * values here. */ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE | PCB_KERNNPX); pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save, cpu_max_ext_state_size); /* * Create a new fresh stack for the new thread. */ bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); /* If the current thread has the trap bit set (i.e. a debugger had * single stepped the process to the system call), we need to clear * the trap flag from the new frame. Otherwise, the new thread will * receive a (likely unexpected) SIGTRAP when it executes the first * instruction after returning to userland. */ td->td_frame->tf_eflags &= ~PSL_T; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* trampoline arg */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */ pcb2->pcb_ebx = (int)td; /* trampoline arg */ - pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_eip = (int)fork_trampoline + setidt_disp; pcb2->pcb_gs = rgs(); /* * If we didn't copy the pcb, we'd need to do the following registers: * pcb2->pcb_cr3: cloned above. * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ pcb2->pcb_ext = NULL; /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } /* * Set that machine state for performing an upcall that starts * the entry function with the given argument. */ void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { /* * Do any extra cleaning that needs to be done. * The thread may have optional components * that are not present in a fresh thread. * This may be a recycled thread so make it look * as though it's newly allocated. */ cpu_thread_clean(td); /* * Set the trap frame to point at the beginning of the entry * function. */ td->td_frame->tf_ebp = 0; td->td_frame->tf_esp = (((int)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4; td->td_frame->tf_eip = (int)entry; /* Return address sentinel value to stop stack unwinding. */ suword((void *)td->td_frame->tf_esp, 0); /* Pass the argument to the entry point. */ suword((void *)(td->td_frame->tf_esp + sizeof(void *)), (int)arg); } int cpu_set_user_tls(struct thread *td, void *tls_base) { struct segment_descriptor sd; uint32_t base; /* * Construct a descriptor and store it in the pcb for * the next context switch. Also store it in the gdt * so that the load of tf_fs into %fs will activate it * at return to userland. 
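
The tf_esp computation in cpu_set_upcall() above rounds the stack top down to 16 bytes and then reserves one extra word for the fake return address, so the entry function sees its argument area 16-byte aligned. Working the expression with made-up numbers:

	unsigned int sp_base = 0xbf800000, sz = 0x10000;
	unsigned int esp;

	esp = (((sp_base + sz - 4) & ~0x0fu) - 4);
	/* 0xbf810000 - 4 = 0xbf80fffc; & ~0xf = 0xbf80fff0; - 4 = 0xbf80ffec */
	/* esp holds the sentinel return address; esp + 4 is 16-byte aligned */
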
*/ base = (uint32_t)tls_base; sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; sd.sd_p = 1; sd.sd_xx = 0; sd.sd_def32 = 1; sd.sd_gran = 1; critical_enter(); /* set %gs */ td->td_pcb->pcb_gsd = sd; if (td == curthread) { PCPU_GET(fsgs_gdt)[1] = sd; load_gs(GSEL(GUGS_SEL, SEL_UPL)); } critical_exit(); return (0); } /* * Convert kernel VA to physical address */ vm_paddr_t kvtop(void *addr) { vm_paddr_t pa; pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("kvtop: zero page frame"); return (pa); } /* * Get an sf_buf from the freelist. May block if none are available. */ void sf_buf_map(struct sf_buf *sf, int flags) { pt_entry_t opte, *ptep; /* * Update the sf_buf's virtual-to-physical mapping, flushing the * virtual address from the TLB. Since the reference count for * the sf_buf's old mapping was zero, that mapping is not * currently in use. Consequently, there is no need to exchange * the old and new PTEs atomically, even under PAE. */ ptep = vtopte(sf->kva); opte = *ptep; - *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V | + *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0); /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor * that has invalidated the sf_buf's virtual address from its TLB * since the last used mapping need not invalidate again. */ #ifdef SMP if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) CPU_ZERO(&sf->cpumask); sf_buf_shootdown(sf, flags); #else if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) pmap_invalidate_page(kernel_pmap, sf->kva); #endif } #ifdef SMP void sf_buf_shootdown(struct sf_buf *sf, int flags) { cpuset_t other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &sf->cpumask)) { CPU_SET(cpuid, &sf->cpumask); invlpg(sf->kva); } if ((flags & SFB_CPUPRIVATE) == 0) { other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_NAND(&other_cpus, &sf->cpumask); if (!CPU_EMPTY(&other_cpus)) { CPU_OR(&sf->cpumask, &other_cpus); smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap); } } sched_unpin(); } #endif /* * MD part of sf_buf_free(). */ int sf_buf_unmap(struct sf_buf *sf) { return (0); } static void sf_buf_invalidate(struct sf_buf *sf) { vm_page_t m = sf->m; /* * Use pmap_qenter to update the pte for * existing mapping, in particular, the PAT * settings are recalculated. */ pmap_qenter(sf->kva, &m, 1); pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE, FALSE); } /* * Invalidate the cache lines that may belong to the page, if * (possibly old) mapping of the page by sf buffer exists. Returns * TRUE when mapping was found and cache invalidated. */ boolean_t sf_buf_invalidate_cache(vm_page_t m) { return (sf_buf_process_page(m, sf_buf_invalidate)); } /* * Software interrupt handler for queued VM system processing. */ void swi_vm(void *dummy) { if (busdma_swi_pending != 0) busdma_swi(); } /* * Tell whether this address is in some physical memory region. * Currently used by the kernel coredump code in order to avoid * dumping the ``ISA memory hole'' which could cause indefinite hangs, * or other unpredictable behaviour. */ int is_physical_memory(vm_paddr_t addr) { #ifdef DEV_ISA /* The ISA ``memory hole''. */ if (addr >= 0xa0000 && addr < 0x100000) return 0; #endif /* * stuff other tests for known memory-mapped devices (PCI?) 
* here */ return 1; } Index: head/sys/i386/include/asmacros.h =================================================================== --- head/sys/i386/include/asmacros.h (revision 332488) +++ head/sys/i386/include/asmacros.h (revision 332489) @@ -1,205 +1,246 @@ +/* -*- mode: asm -*- */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ASMACROS_H_ #define _MACHINE_ASMACROS_H_ #include /* XXX too much duplication in various asm*.h's. */ /* * CNAME is used to manage the relationship between symbol names in C * and the equivalent assembly language names. CNAME is given a name as * it would be used in a C program. It expands to the equivalent assembly * language name. */ #define CNAME(csym) csym #define ALIGN_DATA .p2align 2 /* 4 byte alignment, zero filled */ #ifdef GPROF #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ #else #define ALIGN_TEXT .p2align 2,0x90 /* 4-byte alignment, nop filled */ #endif #define SUPERALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ #define GEN_ENTRY(name) ALIGN_TEXT; .globl CNAME(name); \ .type CNAME(name),@function; CNAME(name): #define NON_GPROF_ENTRY(name) GEN_ENTRY(name) #define NON_GPROF_RET .byte 0xc3 /* opcode for `ret' */ #define END(name) .size name, . - name #ifdef GPROF /* * __mcount is like [.]mcount except that doesn't require its caller to set * up a frame pointer. It must be called before pushing anything onto the * stack. gcc should eventually generate code to call __mcount in most * cases. This would make -pg in combination with -fomit-frame-pointer * useful. gcc has a configuration variable PROFILE_BEFORE_PROLOGUE to * allow profiling before setting up the frame pointer, but this is * inadequate for good handling of special cases, e.g., -fpic works best * with profiling after the prologue. * * [.]mexitcount is a new function to support non-statistical profiling if an * accurate clock is available. 
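sf_buf_shootdown() above implements a lazy shootdown: sf->cpumask records which CPUs already hold a clean TLB entry for the buffer's KVA, so each CPU pays at most one invlpg per remapping and repeat users on the same CPU are free. A toy model of that bookkeeping, with a uint32_t bitmask standing in for cpuset_t and a counter standing in for invlpg():

#include <assert.h>
#include <stdint.h>

static int invalidations;		/* stands in for invlpg() calls */

struct toy_sf {
	uint32_t cpumask;		/* CPUs with a clean TLB for this kva */
};

static void
toy_map(struct toy_sf *sf)
{
	sf->cpumask = 0;		/* new PTE installed: no CPU is clean */
}

static void
toy_use(struct toy_sf *sf, int cpu)
{
	if ((sf->cpumask & (1u << cpu)) == 0) {
		sf->cpumask |= 1u << cpu;
		invalidations++;	/* invlpg on this CPU only */
	}
}

int
main(void)
{
	struct toy_sf sf;

	toy_map(&sf);
	toy_use(&sf, 0);
	toy_use(&sf, 0);		/* second use on CPU 0 is free */
	toy_use(&sf, 1);
	assert(invalidations == 2);
	return (0);
}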
For C sources, calls to it are generated * by the FreeBSD extension `-mprofiler-epilogue' to gcc. It is best to * call [.]mexitcount at the end of a function like the MEXITCOUNT macro does, * but gcc currently generates calls to it at the start of the epilogue to * avoid problems with -fpic. * * [.]mcount and __mcount may clobber the call-used registers and %ef. * [.]mexitcount may clobber %ecx and %ef. * * Cross-jumping makes non-statistical profiling timing more complicated. * It is handled in many cases by calling [.]mexitcount before jumping. It * is handled for conditional jumps using CROSSJUMP() and CROSSJUMP_LABEL(). * It is handled for some fault-handling jumps by not sharing the exit * routine. * * ALTENTRY() must be before a corresponding ENTRY() so that it can jump to * the main entry point. Note that alt entries are counted twice. They * have to be counted as ordinary entries for gprof to get the call times * right for the ordinary entries. * * High local labels are used in macros to avoid clashes with local labels * in functions. * * Ordinary `ret' is used instead of a macro `RET' because there are a lot * of `ret's. 0xc3 is the opcode for `ret' (`#define ret ... ret' can't * be used because this file is sometimes preprocessed in traditional mode). * `ret' clobbers eflags but this doesn't matter. */ #define ALTENTRY(name) GEN_ENTRY(name) ; MCOUNT ; MEXITCOUNT ; jmp 9f #define CROSSJUMP(jtrue, label, jfalse) \ jfalse 8f; MEXITCOUNT; jmp __CONCAT(to,label); 8: #define CROSSJUMPTARGET(label) \ ALIGN_TEXT; __CONCAT(to,label): ; MCOUNT; jmp label #define ENTRY(name) GEN_ENTRY(name) ; 9: ; MCOUNT #define FAKE_MCOUNT(caller) pushl caller ; call __mcount ; popl %ecx #define MCOUNT call __mcount #define MCOUNT_LABEL(name) GEN_ENTRY(name) ; nop ; ALIGN_TEXT #ifdef GUPROF #define MEXITCOUNT call .mexitcount #define ret MEXITCOUNT ; NON_GPROF_RET #else #define MEXITCOUNT #endif #else /* !GPROF */ /* * ALTENTRY() has to align because it is before a corresponding ENTRY(). * ENTRY() has to align to because there may be no ALTENTRY() before it. * If there is a previous ALTENTRY() then the alignment code for ENTRY() * is empty. */ #define ALTENTRY(name) GEN_ENTRY(name) #define CROSSJUMP(jtrue, label, jfalse) jtrue label #define CROSSJUMPTARGET(label) #define ENTRY(name) GEN_ENTRY(name) #define FAKE_MCOUNT(caller) #define MCOUNT #define MCOUNT_LABEL(name) #define MEXITCOUNT #endif /* GPROF */ #ifdef LOCORE + +#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */ +#define SEL_RPL_MASK 0x0003 + /* * Convenience macro for declaring interrupt entry points. */ #define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ .type __CONCAT(X,name),@function; __CONCAT(X,name): /* * Macros to create and destroy a trap frame. */ -#define PUSH_FRAME \ - pushl $0 ; /* dummy error code */ \ - pushl $0 ; /* dummy trap type */ \ - pushal ; /* 8 ints */ \ - pushl $0 ; /* save data and extra segments ... */ \ - movw %ds,(%esp) ; \ - pushl $0 ; \ - movw %es,(%esp) ; \ - pushl $0 ; \ + .macro PUSH_FRAME2 + pushal + pushl $0 + movw %ds,(%esp) + pushl $0 + movw %es,(%esp) + pushl $0 movw %fs,(%esp) + .endm + + .macro PUSH_FRAME + pushl $0 /* dummy error code */ + pushl $0 /* dummy trap type */ + PUSH_FRAME2 + .endm /* * Access per-CPU data. */ #define PCPU(member) %fs:PC_ ## member #define PCPU_ADDR(member, reg) \ movl %fs:PC_PRVSPACE, reg ; \ addl $PC_ ## member, reg /* * Setup the kernel segment registers. 
*/ -#define SET_KERNEL_SREGS \ - movl $KDSEL, %eax ; /* reload with kernel's data segment */ \ - movl %eax, %ds ; \ - movl %eax, %es ; \ - movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ + .macro SET_KERNEL_SREGS + movl $KDSEL, %eax /* reload with kernel's data segment */ + movl %eax, %ds + movl %eax, %es + movl $KPSEL, %eax /* reload with per-CPU data segment */ movl %eax, %fs + .endm + + .macro NMOVE_STACKS + movl PCPU(KESP0), %edx + movl $TF_SZ, %ecx + testl $PSL_VM, TF_EFLAGS(%esp) + jz 1001f + addl $(4*4), %ecx +1001: subl %ecx, %edx + movl %edx, %edi + movl %esp, %esi + rep; movsb + movl %edx, %esp + .endm + + .macro MOVE_STACKS + call 1000f +1000: popl %eax + movl (tramp_idleptd - 1000b)(%eax), %eax + movl %eax, %cr3 + NMOVE_STACKS + .endm + + .macro KENTER + testl $PSL_VM, TF_EFLAGS(%esp) + jnz 2f + testb $SEL_RPL_MASK, TF_CS(%esp) + jz 2f +1: MOVE_STACKS +2: + .endm #endif /* LOCORE */ #ifdef __STDC__ #define ELFNOTE(name, type, desctype, descdata...) \ .pushsection .note.name ; \ .align 4 ; \ .long 2f - 1f /* namesz */ ; \ .long 4f - 3f /* descsz */ ; \ .long type ; \ 1:.asciz #name ; \ 2:.align 4 ; \ 3:desctype descdata ; \ 4:.align 4 ; \ .popsection #else /* !__STDC__, i.e. -traditional */ #define ELFNOTE(name, type, desctype, descdata) \ .pushsection .note.name ; \ .align 4 ; \ .long 2f - 1f /* namesz */ ; \ .long 4f - 3f /* descsz */ ; \ .long type ; \ 1:.asciz "name" ; \ 2:.align 4 ; \ 3:desctype descdata ; \ 4:.align 4 ; \ .popsection #endif /* __STDC__ */ #endif /* !_MACHINE_ASMACROS_H_ */ Index: head/sys/i386/include/frame.h =================================================================== --- head/sys/i386/include/frame.h (revision 332488) +++ head/sys/i386/include/frame.h (revision 332489) @@ -1,44 +1,48 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
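The NMOVE_STACKS macro above is where the new trampoline entry path relocates the trap frame: the frame lands on the shared trampoline stack first, then is copied (TF_SZ bytes, plus four extra words when PSL_VM marks a vm86 frame carrying its saved segment registers) down onto the thread's kernel stack at PCPU(KESP0) before %esp is switched. Roughly, in C, with TF_SZ as an illustrative constant (the real value comes from the generated assym offsets):

#include <string.h>
#include <stdint.h>

#define TF_SZ		72		/* illustrative sizeof(struct trapframe) */
#define PSL_VM		0x00020000
#define VM86_EXTRA	(4 * 4)		/* vm86: saved %ds/%es/%fs/%gs words */

/* Copy the frame at 'esp' to just below 'kesp0'; return the new %esp. */
static char *
move_stacks(char *esp, char *kesp0, uint32_t tf_eflags)
{
	size_t len = TF_SZ;

	if (tf_eflags & PSL_VM)
		len += VM86_EXTRA;
	memcpy(kesp0 - len, esp, len);	/* the rep movsb in the macro */
	return (kesp0 - len);
}

int
main(void)
{
	char tramp[128] = "frame", kstack[256];
	char *sp = move_stacks(tramp, kstack + sizeof(kstack), 0);

	return (sp == kstack + sizeof(kstack) - TF_SZ ? 0 : 1);
}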
* * $FreeBSD$ */ #ifndef _I386_FRAME_H_ #define _I386_FRAME_H_ #include #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) +#define TRAMP_STACK_SZ 4096 +#define TRAMP_COPYOUT_SZ 128 +#define VM86_STACK_SPACE 16 + #endif /* _I386_FRAME_H_ */ Index: head/sys/i386/include/md_var.h =================================================================== --- head/sys/i386/include/md_var.h (revision 332488) +++ head/sys/i386/include/md_var.h (revision 332489) @@ -1,77 +1,82 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _MACHINE_MD_VAR_H_ #define _MACHINE_MD_VAR_H_ #include extern u_int cyrix_did; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef COMPAT_FREEBSD4 extern int szfreebsd4_sigcode; #endif #ifdef COMPAT_43 extern int szosigcode; +extern int sz_lcall_tramp; #endif extern uint32_t *vm_page_dump; +extern vm_offset_t proc0kstack; +extern uintptr_t setidt_disp; struct segment_descriptor; union savefpu; void bcopyb(const void *from, void *to, size_t len); void cpu_switch_load_gs(void) __asm(__STRING(cpu_switch_load_gs)); +void copyout_init_tramp(void); void doreti_iret(void) __asm(__STRING(doreti_iret)); void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault)); void doreti_popl_ds(void) __asm(__STRING(doreti_popl_ds)); void doreti_popl_ds_fault(void) __asm(__STRING(doreti_popl_ds_fault)); void doreti_popl_es(void) __asm(__STRING(doreti_popl_es)); void doreti_popl_es_fault(void) __asm(__STRING(doreti_popl_es_fault)); void doreti_popl_fs(void) __asm(__STRING(doreti_popl_fs)); void doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault)); void fill_based_sd(struct segment_descriptor *sdp, uint32_t base); void i686_pagezero(void *addr); void sse2_pagezero(void *addr); void init_AMD_Elan_sc520(void); vm_paddr_t kvtop(void *addr); void panicifcpuunsupported(void); void ppro_reenable_apic(void); void set_fsbase(struct thread *td, uint32_t base); void set_gsbase(struct thread *td, uint32_t base); void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec); +void setidt_nodisp(int idx, uintptr_t func, int typ, int dpl, int selec); union savefpu *get_pcb_user_save_td(struct thread *td); union savefpu *get_pcb_user_save_pcb(struct pcb *pcb); #endif /* !_MACHINE_MD_VAR_H_ */ Index: head/sys/i386/include/param.h =================================================================== --- head/sys/i386/include/param.h (revision 332488) +++ head/sys/i386/include/param.h (revision 332489) @@ -1,170 +1,169 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)param.h 5.8 (Berkeley) 6/28/91 * $FreeBSD$ */ #ifndef _I386_INCLUDE_PARAM_H_ #define _I386_INCLUDE_PARAM_H_ #include /* * Machine dependent constants for Intel 386. */ #define __HAVE_ACPI #define __HAVE_PIR #define __PCI_REROUTE_INTERRUPT #ifndef MACHINE #define MACHINE "i386" #endif #ifndef MACHINE_ARCH #define MACHINE_ARCH "i386" #endif #define MID_MACHINE MID_I386 #if defined(SMP) || defined(KLD_MODULE) #ifndef MAXCPU #define MAXCPU 32 #endif #else #define MAXCPU 1 #endif /* SMP || KLD_MODULE */ #ifndef MAXMEMDOM #define MAXMEMDOM 1 #endif #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) /* * ALIGNED_POINTER is a boolean macro that checks whether an address * is valid to fetch data elements of type t from on this architecture. * This does not reflect the optimal alignment, just the possibility * (within reasonable limits). */ #define ALIGNED_POINTER(p, t) 1 /* * CACHE_LINE_SIZE is the compile-time maximum cache line size for an * architecture. It should be used with appropriate caution. */ #define CACHE_LINE_SHIFT 6 #define CACHE_LINE_SIZE (1 << CACHE_LINE_SHIFT) #define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ #define PAGE_SIZE (1<> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) #define i386_btop(x) ((x) >> PAGE_SHIFT) #define i386_ptob(x) ((x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024)) -#define INKERNEL(va) (((vm_offset_t)(va)) >= VM_MAXUSER_ADDRESS && \ - ((vm_offset_t)(va)) < VM_MAX_KERNEL_ADDRESS) +#define INKERNEL(va) (TRUE) #endif /* !_I386_INCLUDE_PARAM_H_ */ Index: head/sys/i386/include/pc/bios.h =================================================================== --- head/sys/i386/include/pc/bios.h (revision 332488) +++ head/sys/i386/include/pc/bios.h (revision 332489) @@ -1,355 +1,355 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Michael Smith * Copyright (c) 1998 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PC_BIOS_H_ #define _MACHINE_PC_BIOS_H_ /* * Signature structure for the BIOS32 Service Directory header */ struct bios32_SDheader { u_int8_t sig[4]; u_int32_t entry; u_int8_t revision; u_int8_t len; u_int8_t cksum; u_int8_t pad[5]; }; /* * PnP BIOS presence structure */ struct PnPBIOS_table { u_int8_t sig[4]; /* "$PnP */ u_int8_t version; /* should be 0x10 */ u_int8_t len; /* total structure length */ u_int16_t control; /* BIOS feature flags */ u_int8_t cksum; /* checksum */ u_int32_t evflagaddr; /* address of event notification flag */ u_int16_t rmentryoffset; /* real-mode entry offset */ u_int16_t rmentryseg; /* segment */ u_int16_t pmentryoffset; /* protected-mode entry offset */ u_int32_t pmentrybase; /* segment base */ u_int32_t oemdevid; /* motherboard EISA ID */ u_int16_t rmbiosseg; /* real-mode BIOS segment */ u_int32_t pmdataseg; /* protected-mode data segment */ } __packed; /* * PnP BIOS return codes */ #define PNP_SUCCESS 0x00 #define PNP_NOT_SET_STATICALLY 0x7f #define PNP_UNKNOWN_FUNCTION 0x81 #define PNP_FUNCTION_NOT_SUPPORTED 0x82 #define PNP_INVALID_HANDLE 0x83 #define PNP_BAD_PARAMETER 0x84 #define PNP_SET_FAILED 0x85 #define PNP_EVENTS_NOT_PENDING 0x86 #define PNP_SYSTEM_NOT_DOCKED 0x87 #define PNP_NO_ISA_PNP_CARDS 0x88 #define PNP_UNABLE_TO_DETERMINE_DOCK_CAPABILITIES 0x89 #define PNP_CONFIG_CHANGE_FAILED_NO_BATTERY 0x8a #define PNP_CONFIG_CHANGE_FAILED_RESOURCE_CONFLICT 0x8b #define PNP_BUFFER_TOO_SMALL 0x8c #define PNP_USE_ESCD_SUPPORT 0x8d #define PNP_MESSAGE_NOT_SUPPORTED 0x8e #define PNP_HARDWARE_ERROR 0x8f /* * DMI return codes */ #define DMI_SUCCESS 0x00 #define DMI_UNKNOWN_FUNCTION 0x81 #define DMI_FUNCTION_NOT_SUPPORTED 0x82 #define DMI_INVALID_HANDLE 0x83 #define DMI_BAD_PARAMETER 0x84 #define DMI_INVALID_SUBFUNCTION 0x85 #define DMI_NO_CHANGE 0x86 #define DMI_ADD_STRUCTURE_FAILED 0x87 #define DMI_READ_ONLY 0x8d #define DMI_LOCK_NOT_SUPPORTED 0x90 #define DMI_CURRENTLY_LOCKED 0x91 #define DMI_INVALID_LOCK 0x92 /* * format specifiers and defines for bios16() * s = short (16 bits) * i = int (32 bits) * p = pointer (converted to seg:offset) * C,D,U = selector (corresponding to code/data/utility segment) */ #define PNP_COUNT_DEVNODES "sppD", 0x00 #define PNP_GET_DEVNODE "sppsD", 0x01 #define PNP_SET_DEVNODE "sspsD", 0x02 #define PNP_GET_EVENT "spD", 0x03 #define PNP_SEND_MSG "ssD", 0x04 #define PNP_GET_DOCK_INFO "spD", 0x05 #define PNP_SEL_PRIBOOT "ssiiisspD", 0x07 #define PNP_GET_PRIBOOT "sspppppD", 0x08 #define PNP_SET_RESINFO "spD", 0x09 #define PNP_GET_RESINFO "spD", 0x0A #define PNP_GET_APM_ID "sppD", 0x0B #define PNP_GET_ISA_INFO "spD", 0x40 #define PNP_GET_ECSD_INFO "spppD", 0x41 #define PNP_READ_ESCD "spUD", 0x42 #define PNP_WRITE_ESCD "spUD", 0x43 #define PNP_GET_DMI_INFO "spppppD", 0x50 #define PNP_GET_DMI_STRUCTURE "sppUD", 0x51 #define PNP_SET_DMI_STRUCTURE "sppsUD" 0x52 #define PNP_GET_DMI_CHANGE "spUD" 0x53 #define PNP_DMI_CONTROL "sspsUD" 0x54 #define PNP_GET_GPNV_INFO "sppppD" 0x55 #define PNP_READ_GPNV_DATA
"ssppUD" 0x56 #define PNP_WRITE_GPNV_DATA "sspsUD" 0x57 #define PNP_BOOT_CHECK "sp", 0x60 #define PNP_COUNT_IPL "sppp", 0x61 #define PNP_GET_BOOTPRI "spp", 0x62 #define PNP_SET_BOOTPRI "sp", 0x63 #define PNP_GET_LASTBOOT "sp", 0x64 #define PNP_GET_BOOTFIRST "sp", 0x65 #define PNP_SET_BOOTFIRST "sp", 0x66 /* * PCI BIOS functions */ #define PCIBIOS_BIOS_PRESENT 0xb101 #define PCIBIOS_READ_CONFIG_BYTE 0xb108 #define PCIBIOS_READ_CONFIG_WORD 0xb109 #define PCIBIOS_READ_CONFIG_DWORD 0xb10a #define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b #define PCIBIOS_WRITE_CONFIG_WORD 0xb10c #define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d #define PCIBIOS_GET_IRQ_ROUTING 0xb10e #define PCIBIOS_ROUTE_INTERRUPT 0xb10f /* * PCI interrupt routing table. * * $PIR in the BIOS segment contains a PIR_table * int 1a:b106 returns PIR_table in buffer at es:(e)di * int 1a:b18e returns PIR_table in buffer at es:(e)di * int 1a:b406 returns es:di pointing to the BIOS PIR_table */ struct PIR_header { int8_t ph_signature[4]; u_int16_t ph_version; u_int16_t ph_length; u_int8_t ph_router_bus; u_int8_t ph_router_dev_fn; u_int16_t ph_pci_irqs; u_int16_t ph_router_vendor; u_int16_t ph_router_device; u_int32_t ph_miniport; u_int8_t ph_res[11]; u_int8_t ph_checksum; } __packed; struct PIR_intpin { u_int8_t link; u_int16_t irqs; } __packed; struct PIR_entry { u_int8_t pe_bus; u_int8_t pe_res1:3; u_int8_t pe_device:5; struct PIR_intpin pe_intpin[4]; u_int8_t pe_slot; u_int8_t pe_res3; } __packed; struct PIR_table { struct PIR_header pt_header; struct PIR_entry pt_entry[0]; } __packed; /* * Int 15:E820 'SMAP' structure */ #define SMAP_SIG 0x534D4150 /* 'SMAP' */ #define SMAP_TYPE_MEMORY 1 #define SMAP_TYPE_RESERVED 2 #define SMAP_TYPE_ACPI_RECLAIM 3 #define SMAP_TYPE_ACPI_NVS 4 #define SMAP_TYPE_ACPI_ERROR 5 #define SMAP_TYPE_DISABLED 6 #define SMAP_TYPE_PMEM 7 #define SMAP_TYPE_PRAM 12 #define SMAP_XATTR_ENABLED 0x00000001 #define SMAP_XATTR_NON_VOLATILE 0x00000002 #define SMAP_XATTR_MASK (SMAP_XATTR_ENABLED | SMAP_XATTR_NON_VOLATILE) struct bios_smap { u_int64_t base; u_int64_t length; u_int32_t type; } __packed; /* Structure extended to include extended attribute field in ACPI 3.0. 
*/ struct bios_smap_xattr { u_int64_t base; u_int64_t length; u_int32_t type; u_int32_t xattr; } __packed; /* * System Management BIOS */ #define SMBIOS_START 0xf0000 #define SMBIOS_STEP 0x10 #define SMBIOS_OFF 0 #define SMBIOS_LEN 4 #define SMBIOS_SIG "_SM_" struct smbios_eps { uint8_t anchor_string[4]; /* '_SM_' */ uint8_t checksum; uint8_t length; uint8_t major_version; uint8_t minor_version; uint16_t maximum_structure_size; uint8_t entry_point_revision; uint8_t formatted_area[5]; uint8_t intermediate_anchor_string[5]; /* '_DMI_' */ uint8_t intermediate_checksum; uint16_t structure_table_length; uint32_t structure_table_address; uint16_t number_structures; uint8_t BCD_revision; }; struct smbios_structure_header { uint8_t type; uint8_t length; uint16_t handle; }; #ifdef _KERNEL -#define BIOS_PADDRTOVADDR(x) ((x) + KERNBASE) -#define BIOS_VADDRTOPADDR(x) ((x) - KERNBASE) +#define BIOS_PADDRTOVADDR(x) ((x) + PMAP_MAP_LOW) +#define BIOS_VADDRTOPADDR(x) ((x) - PMAP_MAP_LOW) struct bios_oem_signature { char * anchor; /* search anchor string in BIOS memory */ size_t offset; /* offset from anchor (may be negative) */ size_t totlen; /* total length of BIOS string to copy */ } __packed; struct bios_oem_range { u_int from; /* shouldn't be below 0xe0000 */ u_int to; /* shouldn't be above 0xfffff */ } __packed; struct bios_oem { struct bios_oem_range range; struct bios_oem_signature signature[]; } __packed; struct segment_info { u_int base; u_int limit; }; #define BIOSCODE_FLAG 0x01 #define BIOSDATA_FLAG 0x02 #define BIOSUTIL_FLAG 0x04 #define BIOSARGS_FLAG 0x08 struct bios_segments { struct segment_info code32; /* 32-bit code (mandatory) */ struct segment_info code16; /* 16-bit code */ struct segment_info data; /* 16-bit data */ struct segment_info util; /* 16-bit utility */ struct segment_info args; /* 16-bit args */ }; struct bios_regs { u_int eax; u_int ebx; u_int ecx; u_int edx; u_int esi; u_int edi; }; struct bios_args { u_int entry; /* entry point of routine */ struct bios_regs r; struct bios_segments seg; }; /* * BIOS32 Service Directory entry. Caller supplies name, bios32_SDlookup * fills in the rest of the details. */ struct bios32_SDentry { union { u_int8_t name[4]; /* service identifier */ u_int32_t id; /* as a 32-bit value */ } ident; u_int32_t base; /* base of service */ u_int32_t len; /* service length */ u_int32_t entry; /* entrypoint offset from base */ vm_offset_t ventry; /* entrypoint in kernel virtual segment */ }; /* * Exported lookup results */ extern struct bios32_SDentry PCIbios; int bios_oem_strings(struct bios_oem *oem, u_char *buffer, size_t maxlen); uint32_t bios_sigsearch(uint32_t start, u_char *sig, int siglen, int paralen, int sigofs); int bios16(struct bios_args *, char *, ...); int bios16_call(struct bios_regs *, char *); int bios32(struct bios_regs *, u_int, u_short); int bios32_SDlookup(struct bios32_SDentry *ent); void set_bios_selectors(struct bios_segments *, int); #endif #endif /* _MACHINE_PC_BIOS_H_ */ Index: head/sys/i386/include/pcpu.h =================================================================== --- head/sys/i386/include/pcpu.h (revision 332488) +++ head/sys/i386/include/pcpu.h (revision 332489) @@ -1,244 +1,251 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif #include #include #include #include /* - * The SMP parts are setup in pmap.c and locore.s for the BSP, and - * mp_machdep.c sets up the data for the AP's to "see" when they awake. - * The reason for doing it via a struct is so that an array of pointers - * to each CPU's data can be set up for things like "check curproc on all - * other processors" + * The SMP parts are setup in pmap.c and machdep.c for the BSP, and + * pmap.c and mp_machdep.c sets up the data for the AP's to "see" when + * they awake. The reason for doing it via a struct is so that an + * array of pointers to each CPU's data can be set up for things like + * "check curproc on all other processors" */ #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ - struct i386tss pc_common_tss; \ struct segment_descriptor pc_common_tssd; \ struct segment_descriptor *pc_tss_gdt; \ struct segment_descriptor *pc_fsgs_gdt; \ + struct i386tss *pc_common_tssp; \ + u_int pc_kesp0; \ + u_int pc_trampstk; \ int pc_currentldt; \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ struct mtx pc_cmap_lock; \ void *pc_cmap_pte1; \ void *pc_cmap_pte2; \ caddr_t pc_cmap_addr1; \ caddr_t pc_cmap_addr2; \ vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\ + vm_offset_t pc_copyout_maddr; \ + vm_offset_t pc_copyout_saddr; \ + struct mtx pc_copyout_mlock; \ + struct sx pc_copyout_slock; \ + char *pc_copyout_buf; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[445] + char __pad[550] #ifdef _KERNEL #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. 
*/ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movl %%fs:%1,%0; addl %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__res), 4)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4) { \ __asm __volatile("mov %%fs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds a value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. 
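The point of the size dispatch in __PCPU_GET above (and in __PCPU_SET just below) is that 1-, 2-, and 4-byte fields move with a single %fs-relative instruction, which is inherently atomic against preemption on that CPU, while larger fields fall back to an ordinary access through __PCPU_PTR. A portable caricature of the dispatch only; memcpy stands in for the one-instruction move and there is no real per-CPU segment here:

#include <string.h>

struct pcpu_model { int pc_cpuid; long long pc_big; };

static struct pcpu_model cpu0;		/* stand-in for the %fs self-reference */

/* Small fields take the single-move path, large ones the pointer path. */
#define MODEL_GET(field, out) do {					\
	if (sizeof(cpu0.field) <= 4)					\
		memcpy(&(out), &cpu0.field, sizeof(out));  /* one mov */ \
	else								\
		(out) = cpu0.field;	   /* via __PCPU_PTR analogue */ \
} while (0)

int
main(void)
{
	int id;

	cpu0.pc_cpuid = 3;
	MODEL_GET(pc_cpuid, id);
	return (id == 3 ? 0 : 1);
}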
*/ #define __PCPU_SET(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } while (0) #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movl %%fs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define OFFSETOF_CURTHREAD 0 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnull-dereference" #endif static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("movl %%fs:%1,%0" : "=r" (td) : "m" (*(char *)OFFSETOF_CURTHREAD)); return (td); } #ifdef __clang__ #pragma clang diagnostic pop #endif #define curthread (__curthread()) #define OFFSETOF_CURPCB 16 static __inline __pure2 struct pcb * __curpcb(void) { struct pcb *pcb; __asm("movl %%fs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB)); return (pcb); } #define curpcb (__curpcb()) #else /* defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */ #error "this file needs to be ported to your compiler" #endif /* __GNUCLIKE_ASM etc. */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/i386/include/pmap.h =================================================================== --- head/sys/i386/include/pmap.h (revision 332488) +++ head/sys/i386/include/pmap.h (revision 332489) @@ -1,406 +1,408 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 * $FreeBSD$ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ /* * Page-directory and page-table entries follow this format, with a few * of the fields not present here and there, depending on a lot of things. */ /* ---- Intel Nomenclature ---- */ #define PG_V 0x001 /* P Valid */ #define PG_RW 0x002 /* R/W Read/Write */ #define PG_U 0x004 /* U/S User/Supervisor */ #define PG_NC_PWT 0x008 /* PWT Write through */ #define PG_NC_PCD 0x010 /* PCD Cache disable */ #define PG_A 0x020 /* A Accessed */ #define PG_M 0x040 /* D Dirty */ #define PG_PS 0x080 /* PS Page size (0=4k,1=4M) */ #define PG_PTE_PAT 0x080 /* PAT PAT index */ #define PG_G 0x100 /* G Global */ #define PG_AVAIL1 0x200 /* / Available for system */ #define PG_AVAIL2 0x400 /* < programmers use */ #define PG_AVAIL3 0x800 /* \ */ #define PG_PDE_PAT 0x1000 /* PAT PAT index */ #if defined(PAE) || defined(PAE_TABLES) #define PG_NX (1ull<<63) /* No-execute */ #endif /* Our various interpretations of the above */ #define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ #define PG_MANAGED PG_AVAIL2 #define PG_PROMOTED PG_AVAIL3 /* PDE only */ #if defined(PAE) || defined(PAE_TABLES) #define PG_FRAME (0x000ffffffffff000ull) #define PG_PS_FRAME (0x000fffffffe00000ull) #else #define PG_FRAME (~PAGE_MASK) #define PG_PS_FRAME (0xffc00000) #endif #define PG_PROT (PG_RW|PG_U) /* all protection bits . */ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* Page level cache control fields used to determine the PAT type */ #define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD) #define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD) /* * Promotion to a 2 or 4MB (PDE) page mapping requires that the corresponding * 4KB (PTE) page mappings have identical settings for the following fields: */ #define PG_PTE_PROMOTE (PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \ PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V) /* * Page Protection Exception bits */ #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ /* * Size of Kernel address space. This is the number of page table pages * (4MB each) to use for the kernel. 256 pages == 1 Gigabyte. * This **MUST** be a multiple of 4 (eg: 252, 256, 260, etc). * For PAE, the page table page unit size is 2MB. This means that 512 pages * is 1 Gigabyte. Double everything. It must be a multiple of 8 for PAE. 
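Given the PG_* bits above, a page-table or page-directory entry decodes with plain mask tests; PG_PS is the pivot, selecting the 4M PG_PS_FRAME mask for superpage PDEs versus the 4K PG_FRAME mask for PTEs. A small decoding sketch for the non-PAE layout:

#include <stdio.h>
#include <stdint.h>

#define PG_V		0x001
#define PG_RW		0x002
#define PG_PS		0x080
#define PAGE_MASK	0xfffu
#define PG_FRAME	(~(uint32_t)PAGE_MASK)
#define PG_PS_FRAME	0xffc00000u

static void
decode(uint32_t pde)
{
	if ((pde & PG_V) == 0) {
		printf("not present\n");
		return;
	}
	printf("%s %s frame 0x%08x\n",
	    (pde & PG_PS) ? "4M" : "4K",
	    (pde & PG_RW) ? "rw" : "ro",
	    (unsigned)(pde & ((pde & PG_PS) ? PG_PS_FRAME : PG_FRAME)));
}

int
main(void)
{
	decode(0x00400083);	/* 4M rw mapping of physical 4M */
	decode(0x00001001);	/* 4K ro mapping of physical page 1 */
	return (0);
}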
*/ -#ifndef KVA_PAGES #if defined(PAE) || defined(PAE_TABLES) -#define KVA_PAGES 512 +#define KVA_PAGES (512*4) #else -#define KVA_PAGES 256 +#define KVA_PAGES (256*4) #endif -#endif /* * Pte related macros */ #define VADDR(pdi, pti) ((vm_offset_t)(((pdi)< #include #include #include #include #if defined(PAE) || defined(PAE_TABLES) typedef uint64_t pdpt_entry_t; typedef uint64_t pd_entry_t; typedef uint64_t pt_entry_t; #define PTESHIFT (3) #define PDESHIFT (3) #else typedef uint32_t pd_entry_t; typedef uint32_t pt_entry_t; #define PTESHIFT (2) #define PDESHIFT (2) #endif /* * Address of current address space page table maps and directories. */ #ifdef _KERNEL extern pt_entry_t PTmap[]; extern pd_entry_t PTD[]; extern pd_entry_t PTDpde[]; #if defined(PAE) || defined(PAE_TABLES) extern pdpt_entry_t *IdlePDPT; #endif extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ /* * Translate a virtual address to the kernel virtual address of its page table * entry (PTE). This can be used recursively. If the address of a PTE as * previously returned by this macro is itself given as the argument, then the * address of the page directory entry (PDE) that maps the PTE will be * returned. * * This macro may be used before pmap_bootstrap() is called. */ #define vtopte(va) (PTmap + i386_btop(va)) /* * Translate a virtual address to its physical address. * * This macro may be used before pmap_bootstrap() is called. */ #define vtophys(va) pmap_kextract((vm_offset_t)(va)) /* * KPTmap is a linear mapping of the kernel page table. It differs from the * recursive mapping in two ways: (1) it only provides access to kernel page * table pages, and not user page table pages, and (2) it provides access to * a kernel page table page after the corresponding virtual addresses have * been promoted to a 2/4MB page mapping. * * KPTmap is first initialized by locore to support just NPKT page table * pages. Later, it is reinitialized by pmap_bootstrap() to allow for * expansion of the kernel page table. */ extern pt_entry_t *KPTmap; /* * Extract from the kernel page table the physical address that is mapped by * the given virtual address "va". * * This function may be used before pmap_bootstrap() is called. */ static __inline vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; if ((pa = PTD[va >> PDRSHIFT]) & PG_PS) { pa = (pa & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the PDE at * this point! For example, vtopte() must not be used to * access the PTE because it would use the new PDE. It is, * however, safe to use the old PDE because the page table * page is preserved by the promotion. 
*/ pa = KPTmap[i386_btop(va)]; pa = (pa & PG_FRAME) | (va & PAGE_MASK); } return (pa); } #if (defined(PAE) || defined(PAE_TABLES)) #define pde_cmpset(pdep, old, new) atomic_cmpset_64_i586(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_64_i586(ptep, pte) #define pte_load_clear(ptep) atomic_swap_64_i586(ptep, 0) #define pte_store(ptep, pte) atomic_store_rel_64_i586(ptep, pte) extern pt_entry_t pg_nx; #else /* !(PAE || PAE_TABLES) */ #define pde_cmpset(pdep, old, new) atomic_cmpset_int(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_int(ptep, pte) #define pte_load_clear(ptep) atomic_swap_int(ptep, 0) #define pte_store(ptep, pte) do { \ *(u_int *)(ptep) = (u_int)(pte); \ } while (0) #endif /* !(PAE || PAE_TABLES) */ #define pte_clear(ptep) pte_store(ptep, 0) #define pde_store(pdep, pde) pte_store(pdep, pde) #endif /* _KERNEL */ /* * Pmap stuff */ struct pv_entry; struct pv_chunk; struct md_page { TAILQ_HEAD(,pv_entry) pv_list; int pat_mode; }; struct pmap { struct mtx pm_mtx; pd_entry_t *pm_pdir; /* KVA of page directory */ TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ #if defined(PAE) || defined(PAE_TABLES) pdpt_entry_t *pm_pdpt; /* KVA of page directory pointer table */ #endif struct vm_radix pm_root; /* spare page table pages */ + vm_page_t pm_ptdpg[NPGPTD]; }; typedef struct pmap *pmap_t; #ifdef _KERNEL extern struct pmap kernel_pmap_store; #define kernel_pmap (&kernel_pmap_store) #define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ NULL, MTX_DEF | MTX_DUPOK) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) #endif /* * For each vm_page_t, there is a list of all currently valid virtual * mappings of that page. An entry is a pv_entry_t, the list is pv_list. */ typedef struct pv_entry { vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; /* * pv_entries are allocated in chunks per-process. This avoids the * need to track per-pmap assignments. */ #define _NPCM 11 #define _NPCPV 336 struct pv_chunk { pmap_t pc_pmap; TAILQ_ENTRY(pv_chunk) pc_list; uint32_t pc_map[_NPCM]; /* bitmap; 1 = free */ TAILQ_ENTRY(pv_chunk) pc_lru; struct pv_entry pc_pventry[_NPCPV]; }; #ifdef _KERNEL extern caddr_t CADDR3; extern pt_entry_t *CMAP3; extern vm_paddr_t phys_avail[]; extern vm_paddr_t dump_avail[]; extern int pseflag; extern int pgeflag; extern char *ptvmmap; /* poor name! */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) #define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) /* * Only the following functions or macros may be used before pmap_bootstrap() * is called: pmap_kenter(), pmap_kextract(), pmap_kremove(), vtophys(), and * vtopte(). 
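pmap_kextract() above is the two-level translation in miniature: a PG_PS directory entry resolves the address in one step, pa = (pde & PG_PS_FRAME) | (va & PDRMASK), while the 4K path goes through the KPTmap page-table mapping. The superpage arithmetic, checked standalone with non-PAE constants:

#include <assert.h>
#include <stdint.h>

#define PDRSHIFT	22		/* non-PAE: 4M superpages */
#define PDRMASK		((1u << PDRSHIFT) - 1)
#define PG_PS_FRAME	0xffc00000u

int
main(void)
{
	uint32_t pde = 0x00800083;	/* valid 4M page at physical 8M */
	uint32_t va  = 0xc0123456;
	uint32_t pa;

	pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
	assert(pa == 0x00923456);
	return (0);
}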
*/ void pmap_bootstrap(vm_paddr_t); int pmap_cache_bits(int mode, boolean_t is_pde); int pmap_change_attr(vm_offset_t, vm_size_t, int); void pmap_init_pat(void); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void *pmap_kenter_temporary(vm_paddr_t pa, int i); void pmap_kremove(vm_offset_t); void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); void pmap_unmapdev(vm_offset_t, vm_size_t); pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; void pmap_invalidate_page(pmap_t, vm_offset_t); void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force); +void *pmap_trm_alloc(size_t size, int flags); +void pmap_trm_free(void *addr, size_t size); void invltlb_glob(void); #endif /* _KERNEL */ #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/i386/include/segments.h =================================================================== --- head/sys/i386/include/segments.h (revision 332488) +++ head/sys/i386/include/segments.h (revision 332489) @@ -1,100 +1,99 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1990 William F. Jolitz * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 * $FreeBSD$ */ #ifndef _MACHINE_SEGMENTS_H_ #define _MACHINE_SEGMENTS_H_ /* * 386 Segmentation Data Structures and definitions * William F. 
Jolitz (william@ernie.berkeley.edu) 6/20/1989 */ #include /* * Software definitions are in this convenient format, * which are translated into inconvenient segment descriptors * when needed to be used by the 386 hardware */ struct soft_segment_descriptor { unsigned ssd_base ; /* segment base address */ unsigned ssd_limit ; /* segment extent */ unsigned ssd_type:5 ; /* segment type */ unsigned ssd_dpl:2 ; /* segment descriptor priority level */ unsigned ssd_p:1 ; /* segment descriptor present */ unsigned ssd_xx:4 ; /* unused */ unsigned ssd_xx1:2 ; /* unused */ unsigned ssd_def32:1 ; /* default 32 vs 16 bit size */ unsigned ssd_gran:1 ; /* limit granularity (byte/page units)*/ }; /* * region descriptors, used to load gdt/idt tables before segments yet exist. */ struct region_descriptor { unsigned rd_limit:16; /* segment extent */ unsigned rd_base:32 __packed; /* base address */ }; /* * Segment Protection Exception code bits */ #define SEGEX_EXT 0x01 /* recursive or externally induced */ #define SEGEX_IDT 0x02 /* interrupt descriptor table */ #define SEGEX_TI 0x04 /* local descriptor table */ /* other bits are affected descriptor index */ #define SEGEX_IDX(s) (((s)>>3)&0x1fff) #ifdef _KERNEL extern int _default_ldt; -extern union descriptor gdt[]; -extern union descriptor ldt[NLDT]; +extern union descriptor *gdt; +extern union descriptor *ldt; extern struct soft_segment_descriptor gdt_segs[]; extern struct gate_descriptor *idt; -extern struct region_descriptor r_gdt, r_idt; void lgdt(struct region_descriptor *rdp); void sdtossd(struct segment_descriptor *sdp, struct soft_segment_descriptor *ssdp); void ssdtosd(struct soft_segment_descriptor *ssdp, struct segment_descriptor *sdp); #endif /* _KERNEL */ #endif /* !_MACHINE_SEGMENTS_H_ */ Index: head/sys/i386/include/vmparam.h =================================================================== --- head/sys/i386/include/vmparam.h (revision 332488) +++ head/sys/i386/include/vmparam.h (revision 332489) @@ -1,209 +1,233 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vmparam.h 5.9 (Berkeley) 5/12/91 * $FreeBSD$ */ #ifndef _MACHINE_VMPARAM_H_ #define _MACHINE_VMPARAM_H_ 1 /* * Machine dependent constants for 386. */ /* * Virtual memory related constants, all in bytes */ #define MAXTSIZ (128UL*1024*1024) /* max text size */ #ifndef DFLDSIZ #define DFLDSIZ (128UL*1024*1024) /* initial data size limit */ #endif #ifndef MAXDSIZ #define MAXDSIZ (512UL*1024*1024) /* max data size */ #endif #ifndef DFLSSIZ #define DFLSSIZ (8UL*1024*1024) /* initial stack size limit */ #endif #ifndef MAXSSIZ #define MAXSSIZ (64UL*1024*1024) /* max stack size */ #endif #ifndef SGROWSIZ #define SGROWSIZ (128UL*1024) /* amount to grow stack */ #endif /* * Choose between DENSE and SPARSE based on whether lower execution time or * lower kernel address space consumption is desired. Under PAE, kernel * address space is often in short supply. */ #ifdef PAE #define VM_PHYSSEG_SPARSE #else #define VM_PHYSSEG_DENSE #endif /* * The number of PHYSSEG entries must be one greater than the number * of phys_avail entries because the phys_avail entry that spans the * largest physical address that is accessible by ISA DMA is split * into two PHYSSEG entries. */ #define VM_PHYSSEG_MAX 17 /* * Create one free page pool. Since the i386 kernel virtual address * space does not include a mapping onto the machine's entire physical * memory, VM_FREEPOOL_DIRECT is defined as an alias for the default * pool, VM_FREEPOOL_DEFAULT. */ #define VM_NFREEPOOL 1 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 0 /* * Create two free page lists: VM_FREELIST_DEFAULT is for physical * pages that are above the largest physical address that is * accessible by ISA DMA and VM_FREELIST_ISADMA is for physical pages * that are below that address. */ #define VM_NFREELIST 2 #define VM_FREELIST_DEFAULT 0 #define VM_FREELIST_ISADMA 1 /* * The largest allocation size is 2MB under PAE and 4MB otherwise. */ #ifdef PAE #define VM_NFREEORDER 10 #else #define VM_NFREEORDER 11 #endif /* * Enable superpage reservations: 1 level. */ #ifndef VM_NRESERVLEVEL #define VM_NRESERVLEVEL 1 #endif /* * Level 0 reservations consist of 512 pages when PAE pagetables are * used, and 1024 pages otherwise. */ #ifndef VM_LEVEL_0_ORDER #if defined(PAE) || defined(PAE_TABLES) #define VM_LEVEL_0_ORDER 9 #else #define VM_LEVEL_0_ORDER 10 #endif #endif /* * Kernel physical load address. */ #ifndef KERNLOAD -#define KERNLOAD (1 << PDRSHIFT) +#define KERNLOAD (KERNPTDI << PDRSHIFT) #endif /* !defined(KERNLOAD) */ /* * Virtual addresses of things. Derived from the page directory and * page table indexes from pmap.h for precision. 
* Because of the page that is both a PD and PT, it looks a little * messy at times, but hey, we'll do anything to save a page :-) */ -#define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1) +#define VM_MAX_KERNEL_ADDRESS VADDR(PTDPTDI, 0) -#define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI) +#define VM_MIN_KERNEL_ADDRESS 0 -#define KERNBASE VADDR(KPTDI, 0) +#define KERNBASE KERNLOAD #define UPT_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI) #define UPT_MIN_ADDRESS VADDR(PTDPTDI, 0) -#define VM_MAXUSER_ADDRESS VADDR(PTDPTDI, 0) +#define VM_MAXUSER_ADDRESS VADDR(TRPTDI, 0) #define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) #define USRSTACK SHAREDPAGE -#define VM_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI) +#define VM_MAX_ADDRESS VADDR(PTDPTDI, 0) #define VM_MIN_ADDRESS ((vm_offset_t)0) + +#define PMAP_TRM_MIN_ADDRESS VM_MAXUSER_ADDRESS +#define PMAP_TRM_MAX_ADDRESS 0xffffffff + +#define PMAP_MAP_LOW VADDR(LOWPTDI, 0) + +/* + * KVA layout. The unit of the system allocation is a single PDE, which + * represents NBPDR bytes, aligned to NBPDR. NBPDR is 4M for non-PAE + * page tables, and 2M for PAE. Addresses below are shown for non-PAE. + * + * 0x00000000 - 0x003fffff Transient identity map of low memory (0-4M), + * normally disabled to catch NULL derefs. + * 0x00400000 - 0x007fffff Fixed mapping of the low memory (0-4M). + * 0x00800000 - 0xffbfffff KERNBASE (VA) == KERNLOAD (PA), kernel + * text + data and all kernel maps. Managed + * by MI VM. + * 0xffc00000 - 0xffdfffff Recursive kernel page table mapping, pointed + * to by PTmap. PTD[] recursively points + * into PTmap. + * 0xffe00000 - 0xffffffff Kernel/User mode shared PDE, contains GDT, + * IDT, TSS, LDT, trampoline code and stacks. + * Managed by pmap_trm_alloc(). + */ /* * How many physical pages per kmem arena virtual page. */ #ifndef VM_KMEM_SIZE_SCALE #define VM_KMEM_SIZE_SCALE (3) #endif /* * Optional floor (in bytes) on the size of the kmem arena. */ #ifndef VM_KMEM_SIZE_MIN #define VM_KMEM_SIZE_MIN (12 * 1024 * 1024) #endif /* * Optional ceiling (in bytes) on the size of the kmem arena: 40% of the * kernel map rounded to the nearest multiple of the superpage size. */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX (((((VM_MAX_KERNEL_ADDRESS - \ VM_MIN_KERNEL_ADDRESS) >> (PDRSHIFT - 2)) + 5) / 10) << PDRSHIFT) #endif /* initial pagein size of beginning of executable file */ #ifndef VM_INITIAL_PAGEIN #define VM_INITIAL_PAGEIN 16 #endif #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ #ifndef VM_MAX_AUTOTUNE_MAXUSERS #define VM_MAX_AUTOTUNE_MAXUSERS 384 #endif #define SFBUF #define SFBUF_MAP #define SFBUF_CPUSET #define SFBUF_PROCESS_PAGE #define PMAP_HAS_DMAP 0 #define PHYS_TO_DMAP(x) ({ panic("No direct map exists"); 0; }) #define DMAP_TO_PHYS(x) ({ panic("No direct map exists"); 0; }) #endif /* _MACHINE_VMPARAM_H_ */ Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 332488) +++ head/sys/kern/imgact_aout.c (revision 332489) @@ -1,344 +1,348 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __amd64__ #include #include #include #include #include #endif static int exec_aout_imgact(struct image_params *imgp); static int aout_fixup(register_t **stack_base, struct image_params *imgp); +#define AOUT32_USRSTACK 0xbfc00000 + #if defined(__i386__) + +#define AOUT32_PS_STRINGS (AOUT32_USRSTACK - sizeof(struct ps_strings)) + struct sysentvec aout_sysvec = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = aout_fixup, .sv_sendsig = sendsig, .sv_sigcode = sigcode, .sv_szsigcode = &szsigcode, .sv_name = "FreeBSD a.out", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = AOUT32_USRSTACK, + .sv_usrstack = AOUT32_USRSTACK, + .sv_psstrings = AOUT32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; #elif defined(__amd64__) -#define AOUT32_USRSTACK 0xbfc00000 #define AOUT32_PS_STRINGS \ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings)) -#define AOUT32_MINUSER FREEBSD32_MINUSER +#define AOUT32_MINUSER FREEBSD32_MINUSER extern const char *freebsd32_syscallnames[]; extern u_long ia32_maxssiz; struct sysentvec aout_sysvec = { .sv_size = FREEBSD32_SYS_MAXSYSCALL, .sv_table = freebsd32_sysent, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = aout_fixup, .sv_sendsig = ia32_sendsig, .sv_sigcode = ia32_sigcode, .sv_szsigcode = &sz_ia32_sigcode, .sv_name = "FreeBSD a.out", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_pagesize = IA32_PAGE_SIZE, .sv_minuser = AOUT32_MINUSER, .sv_maxuser = AOUT32_USRSTACK, .sv_usrstack = AOUT32_USRSTACK, .sv_psstrings = AOUT32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = freebsd32_copyout_strings, .sv_setregs = ia32_setregs, .sv_fixlimit = ia32_fixlimit, 
.sv_maxssiz = &ia32_maxssiz, .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = ia32_set_syscall_retval, .sv_fetch_syscall_args = ia32_fetch_syscall_args, .sv_syscallnames = freebsd32_syscallnames, }; #else #error "Port me" #endif static int aout_fixup(register_t **stack_base, struct image_params *imgp) { *(char **)stack_base -= sizeof(uint32_t); return (suword32(*stack_base, imgp->args->argc)); } static int exec_aout_imgact(struct image_params *imgp) { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; vm_map_t map; vm_object_t object; vm_offset_t text_end, data_end; unsigned long virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. * NetBSD is in network byte order.. ugh. */ if (((a_out->a_midmag >> 16) & 0xff) != 0x86 && ((a_out->a_midmag >> 16) & 0xff) != 0 && ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86) return -1; /* * Set file/virtual offset based on a.out variant. * We handle two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_midmag & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; /* Pass PS_STRINGS for BSD/OS binaries only. */ if (N_GETMID(*a_out) == MID_ZERO) imgp->ps_strings = aout_sysvec.sv_psstrings; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if (/* entry point must lie within the text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK #ifdef __amd64__ || /* overflows */ virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX #endif ) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (/* text can't exceed maximum text size */ a_out->a_text > maxtsiz || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) || racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero.
*/ VOP_UNLOCK(imgp->vp, 0); /* * Destroy old process VM and create a new one (with a new stack) */ error = exec_new_vmspace(imgp, &aout_sysvec); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); /* * The vm space can be changed by exec_new_vmspace */ vmspace = imgp->proc->p_vmspace; object = imgp->object; map = &vmspace->vm_map; vm_map_lock(map); vm_object_reference(object); text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, object, file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); error = vm_map_insert(map, object, file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } } if (bss_size) { error = vm_map_insert(map, NULL, 0, data_end, data_end + bss_size, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_map_unlock(map); return (error); } } vm_map_unlock(map); /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw aout_execsw = { .ex_imgact = exec_aout_imgact, .ex_name = "a.out" }; EXEC_SET(aout, aout_execsw); Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 332488) +++ head/sys/kern/subr_witness.c (revision 332489) @@ -1,3071 +1,3078 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. 
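 * (Editor's addition, not in the original: the recursion count lives in
 * the low 16 bits of li_flags, so witness_lock() recurses with a bare
 * li_flags++, and "held exclusively and recursed" is tested as
 *
 *	(instance->li_flags & LI_RECURSEMASK) != 0 &&
 *	(instance->li_flags & LI_EXCLUSIVE) != 0
 * )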
*/ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in a per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets.
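 * (Editor's sketch of the chained lookup this field supports, assuming
 * the usual bucket-array shape of the w_hash table declared below:
 *
 *	for (w = w_hash.wh_array[hash]; w != NULL; w = w->w_hash_next)
 *		if (strcmp(w->w_name, key) == 0)
 *			return (w);
 * )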
*/ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t 
witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. 
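 * (Usage sketch, editor's addition: from a shell,
 *
 *	sysctl debug.witness.badstacks
 *
 * prints the recorded lock-order reversals together with their stacks,
 * just as debug.witness.fullgraph above dumps the relation graph.)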
*/ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; +#ifdef __i386__ +static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; +#endif - static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rw }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in_multi_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. 
*/ { "udpinp", &lock_class_rw }, { "in6_multi_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_rw }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_rw }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_rw }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", 
&lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. 
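 * (Editor's illustration: each group in order_lists[] is a
 * NULL-terminated run, e.g.
 *
 *	{ "A", &lock_class_mtx_sleep },
 *	{ "B", &lock_class_mtx_sleep },
 *	{ NULL, NULL },
 *
 * which records "A before B"; the nested loops below enroll each name
 * and chain consecutive entries with itismychild().)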
*/ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. 
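 * (Editor's worked example, not in the original: with a recorded order
 * A -> B -> C, A has no ancestors and is treated here as a root; the
 * recursion below then assigns A ddb level 0, B level 1 and C level 2.)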
*/ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no ancestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order.
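 * (Editor's usage sketch, hypothetical caller: given two lock objects
 * la and lb that witness already knows about,
 *
 *	error = witness_defineorder(la, lb);
 *
 * returns 0 and records "la before lb", or EDOOFUS if the opposite
 * order was already established above.)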
*/ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. 
If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enforce a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reversal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug.
VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. 
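 * (Editor's note: entries fill children[0..LOCK_NCHILDREN-1]; once the
 * head entry is full, the code below pushes a fresh lock_list_entry:
 *
 *	lle->ll_next = *lock_list;
 *	*lock_list = lle;
 *
 * so the most recently acquired locks always sit in the head entry.)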
*/ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = 
curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * any leftover registered locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we always want to keep a * head object in the lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to keep the current code simple, when the head * object is totally unloaded it also means that there are no * further objects in the list, so the list ownership needs to be * handed over to another object if the current head needs to be freed.
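 * (Editor's example: a thread that repeatedly locks and unlocks one
 * mutex keeps its now-empty head entry (ll_count == 0, ll_next == NULL),
 * so the next witness_lock() fills children[0] again without a trip
 * through the shared free pool.)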
*/ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once all checks on spinlock ownership have passed, the thread is * on a safe path and can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and, since the flags * cannot match for this lock class, check whether the * first spinlock is the one curthread should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ?
"non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. 
*/ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other. */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or indirect descendant of @ancestor.
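 * For example (hypothetical chain): after itismychild(A, B) and * itismychild(B, C), isitmydescendant(A, C) returns non-zero because * adopt() recorded A as an ancestor of C, while isitmychild(A, C) * still returns 0.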
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant; the SCHEDULER_STOPPED() check can be removed here after * Giant is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant; the SCHEDULER_STOPPED() check can be removed here after * Giant is gone.
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static 
void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had an MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPUs for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock.
*/ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND(badstacks, db_witness_badstacks) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; + +#ifdef __i386__ + error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); + return (error); +#endif if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if 
(sb == NULL) return (ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. 
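 * The static w_lodata[] entries are threaded onto the w_lofree list * below so that witness_lock_order_add() can take preallocated * records without allocating; as a sketch, after the loop runs: * w_lofree -> w_lodata[N-1] -> ... -> w_lodata[0] -> NULL, where * N is WITNESS_LO_DATA_COUNT.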
*/ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. 
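 * Readers such as sbuf_print_witness_badstacks() rely on this: they * sample w_generation under w_mtx, drop the lock to do slow work, * and later re-check the counter, restarting if it moved. A sketch * of the pattern: mtx_lock_spin(&w_mtx); gen = w_generation; * mtx_unlock_spin(&w_mtx); ... if (gen != w_generation) goto restart;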
*/ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: head/sys/x86/acpica/acpi_wakeup.c =================================================================== --- head/sys/x86/acpica/acpi_wakeup.c (revision 332488) +++ head/sys/x86/acpica/acpi_wakeup.c (revision 332489) @@ -1,453 +1,458 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 Takanori Watanabe * Copyright (c) 2001-2012 Mitsuru IWASAKI * Copyright (c) 2003 Peter Wemm * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if defined(__amd64__) #define DEV_APIC #else #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #endif #ifdef SMP #include #include #endif #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" /* Make sure the code is less than a page and leave room for the stack. 
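 * With the usual 4 KB x86 page this limits wakecode to * PAGE_SIZE - 1024 = 3072 bytes, reserving the top 1 KB of the wake * page for the temporary stack; the CTASSERT below enforces the * limit at compile time.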
*/ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); extern int acpi_resume_beep; extern int acpi_reset_video; #ifdef SMP extern struct susppcb **susppcbs; static cpuset_t suspcpus; #else static struct susppcb **susppcbs; #endif static void *acpi_alloc_wakeup_handler(void **); static void acpi_stop_beep(void *); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *); #endif #ifdef __amd64__ #define ACPI_WAKEPAGES 4 #else #define ACPI_WAKEPAGES 1 #endif #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ addr = (type *)(sc->acpi_wakeaddr + (offset)); \ *addr = val; \ } while (0) static void acpi_stop_beep(void *arg) { if (acpi_resume_beep != 0) timer_spkr_release(); } #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { struct pcb *pcb; int vector = (sc->acpi_wakephys >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; pcb = &susppcbs[cpu]->sp_pcb; WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to resume. */ for (ms = 0; ms < 5000; ms++) { if (!CPU_ISSET(cpu, &suspended_cpus)) return (1); /* return SUCCESS */ DELAY(1000); } return (0); /* return FAILURE */ } #define WARMBOOT_TARGET 0 +#ifdef __amd64__ #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) +#else /* __i386__ */ +#define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467) +#define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469) +#endif #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) static void acpi_wakeup_cpus(struct acpi_softc *sc) { uint32_t mpbioswarmvec; int cpu; u_char mpbiosreason; /* save the current value of the warm-start vector */ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = sc->acpi_wakephys >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Wake up each AP. */ for (cpu = 1; cpu < mp_ncpus; cpu++) { if (!CPU_ISSET(cpu, &suspcpus)) continue; if (acpi_wakeup_ap(sc, cpu) == 0) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", cpu, cpu_apic_ids[cpu]); } } #ifdef __i386__ /* * Remove the identity mapping of low memory for all CPUs and sync * the TLB for the BSP. The APs are now spinning in * cpususpend_handler() and we will release them soon. Then each * will invalidate its TLB. 
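 * (A single i386 page-directory entry maps 4 MB of low memory here, * or 2 MB with PAE, so clearing one PDE slot below is enough to drop * the temporary identity mapping before invltlb_glob() flushes the * TLB.)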
*/ - kernel_pmap->pm_pdir[0] = 0; + PTD[KPTDI] = 0; invltlb_glob(); #endif /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); } #endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; struct pcb *pcb; if (sc->acpi_wakeaddr == 0ul) return (-1); /* couldn't alloc wake memory */ #ifdef SMP suspcpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &suspcpus); #endif if (acpi_resume_beep != 0) timer_spkr_acquire(); AcpiSetFirmwareWakingVector(sc->acpi_wakephys, 0); intr_suspend(); pcb = &susppcbs[0]->sp_pcb; if (savectx(pcb)) { #ifdef __amd64__ fpususpend(susppcbs[0]->sp_fpususpend); #else npxsuspend(susppcbs[0]->sp_fpususpend); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) { device_printf(sc->acpi_dev, "Failed to suspend APs\n"); return (0); /* couldn't sleep */ } #endif WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); #ifdef __amd64__ WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER) & ~(EFER_LMA)); #else WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4); #endif WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); #ifdef __i386__ /* * Map some low memory with virt == phys for ACPI wakecode * to use to jump to high memory after enabling paging. This * is the same as for the similar jump in locore, except the * jump is a single instruction, and we know its address * more precisely so we only need a single PTD, and we have to * be careful to use the kernel map (PTD[0] is for curthread * which may be a user thread in deprecated APIs). */ - kernel_pmap->pm_pdir[0] = PTD[KPTDI]; + PTD[KPTDI] = PTD[LOWPTDI]; #endif /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); return (0); /* couldn't sleep */ } for (;;) ia32_pause(); } else { #ifdef __amd64__ fpuresume(susppcbs[0]->sp_fpususpend); #else npxresume(susppcbs[0]->sp_fpususpend); #endif } return (1); /* woke up successfully */ } int acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result, int intr_enabled) { if (sleep_result == -1) return (sleep_result); if (!intr_enabled) { /* Wakeup MD procedures in interrupt-disabled context */ if (sleep_result == 1) { pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); #ifdef DEV_APIC lapic_xapic_mode(); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus)) acpi_wakeup_cpus(sc); #endif } #ifdef SMP if (!CPU_EMPTY(&suspcpus)) resume_cpus(suspcpus); #endif mca_resume(); #ifdef __amd64__ if (vmm_resume_p != NULL) vmm_resume_p(); #endif intr_resume(/*suspend_cancelled*/false); AcpiSetFirmwareWakingVector(0, 0); } else { /* Wakeup MD procedures in interrupt-enabled context */ if (sleep_result == 1 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); } return (sleep_result); } static void * acpi_alloc_wakeup_handler(void *wakepages[ACPI_WAKEPAGES]) { int i; memset(wakepages, 0, ACPI_WAKEPAGES * sizeof(*wakepages)); /* * Specify the region for our wakeup code.
We want it in the low 1 MB * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT), * and ROM area (0xa0000 and above). The temporary page tables must be * page-aligned. */ for (i = 0; i < ACPI_WAKEPAGES; i++) { wakepages[i] = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT, 0x500, 0xa0000, PAGE_SIZE, 0ul); if (wakepages[i] == NULL) { printf("%s: can't alloc wake memory\n", __func__); goto freepages; } } if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, EVENTHANDLER_PRI_LAST) == NULL) { printf("%s: can't register event handler\n", __func__); goto freepages; } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); } return (wakepages); freepages: for (i = 0; i < ACPI_WAKEPAGES; i++) if (wakepages[i] != NULL) contigfree(wakepages[i], PAGE_SIZE, M_DEVBUF); return (NULL); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { static void *wakeaddr; void *wakepages[ACPI_WAKEPAGES]; #ifdef __amd64__ uint64_t *pt4, *pt3, *pt2; vm_paddr_t pt4pa, pt3pa, pt2pa; int i; #endif if (wakeaddr != NULL) return; if (acpi_alloc_wakeup_handler(wakepages) == NULL) return; wakeaddr = wakepages[0]; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); #ifdef __amd64__ pt4 = wakepages[1]; pt3 = wakepages[2]; pt2 = wakepages[3]; pt4pa = vtophys(pt4); pt3pa = vtophys(pt3); pt2pa = vtophys(pt2); #endif bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode)); /* Patch GDT base address, ljmp targets. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, sc->acpi_wakephys + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, sc->acpi_wakephys + wakeup_32); #ifdef __amd64__ WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, sc->acpi_wakephys + wakeup_64); WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa); #endif /* Save pointers to some global data. */ WAKECODE_FIXUP(wakeup_ret, void *, resumectx); #ifndef __amd64__ #if defined(PAE) || defined(PAE_TABLES) WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdpt)); #else WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir)); #endif #else /* __amd64__ */ /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* * Each slot of the level 4 pages points * to the same level 3 page */ pt4[i] = (uint64_t)pt3pa; pt4[i] |= PG_V | PG_RW | PG_U; /* * Each slot of the level 3 pages points * to the same level 2 page */ pt3[i] = (uint64_t)pt2pa; pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } #endif /* !__amd64__ */ if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n", (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); } Index: head/sys/x86/x86/local_apic.c =================================================================== --- head/sys/x86/x86/local_apic.c (revision 332488) +++ head/sys/x86/x86/local_apic.c (revision 332489) @@ -1,2152 +1,2150 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Local APIC support on Pentium and later processors. */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT -#define SDT_APICT SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT -#define SDT_APICT SDT_SYS386TGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif static MALLOC_DEFINE(M_LAPIC, "local_apic", "Local APIC items"); /* Sanity checks on IDT vectors. */ CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT); CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS); CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); /* Magic IRQ values for the timer and syscalls. */ #define IRQ_TIMER (NUM_IO_INTS + 1) #define IRQ_SYSCALL (NUM_IO_INTS + 2) #define IRQ_DTRACE_RET (NUM_IO_INTS + 3) #define IRQ_EVTCHN (NUM_IO_INTS + 4) enum lat_timer_mode { LAT_MODE_UNDEF = 0, LAT_MODE_PERIODIC = 1, LAT_MODE_ONESHOT = 2, LAT_MODE_DEADLINE = 3, }; /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts * from I/O devices and then forward them on to the local APICs. * * Local APICs can also send interrupts to each other thus providing the * mechanism for IPIs. */ struct lvt { u_int lvt_edgetrigger:1; u_int lvt_activehi:1; u_int lvt_masked:1; u_int lvt_active:1; u_int lvt_mode:16; u_int lvt_vector:8; }; struct lapic { struct lvt la_lvts[APIC_LVT_MAX + 1]; struct lvt la_elvts[APIC_ELVT_MAX + 1]; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; uint64_t la_timer_period; enum lat_timer_mode la_timer_mode; uint32_t lvt_timer_base; uint32_t lvt_timer_last; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static *lapics; /* Global defaults for local APIC LVT entries.
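 * Each initializer below fills struct lvt in declaration order: * { lvt_edgetrigger, lvt_activehi, lvt_masked, lvt_active, lvt_mode, * lvt_vector }; for example, { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 } is an * active, unmasked, edge-triggered, active-high NMI entry (LINT1).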
*/ static struct lvt lvts[APIC_LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; /* Global defaults for AMD local APIC ELVT entries. */ static struct lvt elvts[APIC_ELVT_MAX + 1] = { { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, }; static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ IDTVEC(apic_isr2), /* 64 - 95 */ IDTVEC(apic_isr3), /* 96 - 127 */ IDTVEC(apic_isr4), /* 128 - 159 */ IDTVEC(apic_isr5), /* 160 - 191 */ IDTVEC(apic_isr6), /* 192 - 223 */ IDTVEC(apic_isr7), /* 224 - 255 */ }; static inthand_t *ioint_pti_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1_pti), /* 32 - 63 */ IDTVEC(apic_isr2_pti), /* 64 - 95 */ IDTVEC(apic_isr3_pti), /* 96 - 127 */ IDTVEC(apic_isr4_pti), /* 128 - 159 */ IDTVEC(apic_isr5_pti), /* 160 - 191 */ IDTVEC(apic_isr6_pti), /* 192 - 223 */ IDTVEC(apic_isr7_pti), /* 224 - 255 */ }; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd); volatile char *lapic_map; vm_paddr_t lapic_paddr; int x2apic_mode; int lapic_eoi_suppression; static int lapic_timer_tsc_deadline; static u_long lapic_timer_divisor, count_freq; static struct eventtimer lapic_et; #ifdef SMP static uint64_t lapic_ipi_wait_mult; #endif unsigned int max_apic_id; SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options"); SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD, &lapic_eoi_suppression, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD, &lapic_timer_tsc_deadline, 0, ""); static uint32_t lapic_read32(enum LAPIC_REGISTERS reg) { uint32_t res; if (x2apic_mode) { res = rdmsr32(MSR_APIC_000 + reg); } else { res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL); } return (res); } static void lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { mfence(); lfence(); wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } static void lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } #ifdef SMP static uint64_t lapic_read_icr(void) { uint64_t v; uint32_t vhi, vlo; if (x2apic_mode) { v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO); } else { vhi = lapic_read32(LAPIC_ICR_HI); vlo = lapic_read32(LAPIC_ICR_LO); v = ((uint64_t)vhi << 32) | vlo; } return (v); } static uint64_t lapic_read_icr_lo(void) { return (lapic_read32(LAPIC_ICR_LO)); } static void lapic_write_icr(uint32_t vhi, uint32_t vlo) { uint64_t v; if (x2apic_mode) { v = ((uint64_t)vhi << 32) | vlo; mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v); } else { lapic_write32(LAPIC_ICR_HI, vhi); lapic_write32(LAPIC_ICR_LO, vlo); } } #endif /* SMP */ static void native_lapic_enable_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); 
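/* Read-modify-write of IA32_APIC_BASE: set the x2APIC enable bit (EXTD, bit 10) together with the global enable bit (EN, bit 11); going from xAPIC to x2APIC is a legal transition of that MSR. */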
apic_base |= APICBASE_X2APIC | APICBASE_ENABLED; wrmsr(MSR_APICBASE, apic_base); } static bool native_lapic_is_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) == (APICBASE_X2APIC | APICBASE_ENABLED)); } static void lapic_enable(void); static void lapic_resume(struct pic *pic, bool suspend_cancelled); static void lapic_timer_oneshot(struct lapic *); static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t); static void lapic_timer_periodic(struct lapic *); static void lapic_timer_deadline(struct lapic *); static void lapic_timer_stop(struct lapic *); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period); static int lapic_et_stop(struct eventtimer *et); static u_int apic_idt_to_irq(u_int apic_id, u_int vector); static void lapic_set_tpr(u_int vector); struct pic lapic_pic = { .pic_resume = lapic_resume }; /* Forward declarations for apic_ops */ static void native_lapic_create(u_int apic_id, int boot_cpu); static void native_lapic_init(vm_paddr_t addr); static void native_lapic_xapic_mode(void); static void native_lapic_setup(int boot); static void native_lapic_dump(const char *str); static void native_lapic_disable(void); static void native_lapic_eoi(void); static int native_lapic_id(void); static int native_lapic_intr_pending(u_int vector); static u_int native_apic_cpuid(u_int apic_id); static u_int native_apic_alloc_vector(u_int apic_id, u_int irq); static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align); static void native_apic_disable_vector(u_int apic_id, u_int vector); static void native_apic_enable_vector(u_int apic_id, u_int vector); static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq); static void native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id); static int native_lapic_enable_pmc(void); static void native_lapic_disable_pmc(void); static void native_lapic_reenable_pmc(void); static void native_lapic_enable_cmc(void); static int native_lapic_enable_mca_elvt(void); static int native_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked); static int native_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode); static int native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol); static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); #ifdef SMP static void native_lapic_ipi_raw(register_t icrlo, u_int dest); static void native_lapic_ipi_vectored(u_int vector, int dest); static int native_lapic_ipi_wait(int delay); #endif /* SMP */ static int native_lapic_ipi_alloc(inthand_t *ipifunc); static void native_lapic_ipi_free(int vector); struct apic_ops apic_ops = { .create = native_lapic_create, .init = native_lapic_init, .xapic_mode = native_lapic_xapic_mode, .is_x2apic = native_lapic_is_x2apic, .setup = native_lapic_setup, .dump = native_lapic_dump, .disable = native_lapic_disable, .eoi = native_lapic_eoi, .id = native_lapic_id, .intr_pending = native_lapic_intr_pending, .set_logical_id = native_lapic_set_logical_id, .cpuid = native_apic_cpuid, .alloc_vector = native_apic_alloc_vector, .alloc_vectors = native_apic_alloc_vectors, .enable_vector = native_apic_enable_vector, .disable_vector = native_apic_disable_vector, .free_vector = native_apic_free_vector, .enable_pmc = native_lapic_enable_pmc, 
.disable_pmc = native_lapic_disable_pmc, .reenable_pmc = native_lapic_reenable_pmc, .enable_cmc = native_lapic_enable_cmc, .enable_mca_elvt = native_lapic_enable_mca_elvt, #ifdef SMP .ipi_raw = native_lapic_ipi_raw, .ipi_vectored = native_lapic_ipi_vectored, .ipi_wait = native_lapic_ipi_wait, #endif .ipi_alloc = native_lapic_ipi_alloc, .ipi_free = native_lapic_ipi_free, .set_lvt_mask = native_lapic_set_lvt_mask, .set_lvt_mode = native_lapic_set_lvt_mode, .set_lvt_polarity = native_lapic_set_lvt_polarity, .set_lvt_triggermode = native_lapic_set_lvt_triggermode, }; static uint32_t lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value) { value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) value |= APIC_LVT_TM; if (lvt->lvt_activehi == 0) value |= APIC_LVT_IIPP_INTALO; if (lvt->lvt_masked) value |= APIC_LVT_M; value |= lvt->lvt_mode; switch (lvt->lvt_mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: if (!lvt->lvt_edgetrigger && bootverbose) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value &= ~APIC_LVT_TM; } /* Use a vector of 0. */ break; case APIC_LVT_DM_FIXED: value |= lvt->lvt_vector; break; default: panic("bad APIC LVT delivery mode: %#x\n", value); } return (value); } static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { struct lvt *lvt; KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin)); if (la->la_lvts[pin].lvt_active) lvt = &la->la_lvts[pin]; else lvt = &lvts[pin]; return (lvt_mode_impl(la, lvt, pin, value)); } static uint32_t elvt_mode(struct lapic *la, u_int idx, uint32_t value) { struct lvt *elvt; KASSERT(idx <= APIC_ELVT_MAX, ("%s: idx %u out of range", __func__, idx)); elvt = &la->la_elvts[idx]; KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx)); KASSERT(elvt->lvt_edgetrigger, ("%s: ELVT%u is not edge triggered", __func__, idx)); KASSERT(elvt->lvt_activehi, ("%s: ELVT%u is not active high", __func__, idx)); return (lvt_mode_impl(la, elvt, idx, value)); } /* * Map the local APIC and setup necessary interrupt vectors. */ static void native_lapic_init(vm_paddr_t addr) { #ifdef SMP uint64_t r, r1, r2, rx; #endif uint32_t ver; u_int regs[4]; int i, arat; /* * Enable x2APIC mode if possible. Map the local APIC * registers page. * * Keep the LAPIC registers page mapped uncached for x2APIC * mode too, to have direct map page attribute set to * uncached. This is needed to work around CPU errata present * on all Intel processors. */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic_paddr = addr; lapic_map = pmap_mapdev(addr, PAGE_SIZE); if (x2apic_mode) { native_lapic_enable_x2apic(); lapic_map = NULL; } /* Setup the spurious interrupt handler. */ setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Perform basic initialization of the BSP's local APIC. */ lapic_enable(); /* Set BSP's per-CPU local APIC ID. */ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. */ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, pti ? 
IDTVEC(cmcint_pti) : IDTVEC(cmcint), - SDT_APICT, SEL_KPL, GSEL_APIC); + SDT_APIC, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; /* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) { do_cpuid(0x06, regs); if ((regs[0] & CPUTPM1_ARAT) != 0) arat = 1; } else if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) >= 0x12) { arat = 1; } bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; lapic_et.et_quality = 100; } if ((cpu_feature & CPUID_TSC) != 0 && (cpu_feature2 & CPUID2_TSCDLT) != 0 && tsc_is_invariant && tsc_freq != 0) { lapic_timer_tsc_deadline = 1; TUNABLE_INT_FETCH("hw.lapic_tsc_deadline", &lapic_timer_tsc_deadline); } lapic_et.et_frequency = 0; /* We don't know frequency yet, so trying to guess. */ lapic_et.et_min_period = 0x00001000LL; lapic_et.et_max_period = SBT_1S; lapic_et.et_start = lapic_et_start; lapic_et.et_stop = lapic_et_stop; lapic_et.et_priv = NULL; et_register(&lapic_et); } /* * Set lapic_eoi_suppression after lapic_enable(), to not * enable suppression in the hardware prematurely. Note that * we by default enable suppression even when system only has * one IO-APIC, since EOI is broadcasted to all APIC agents, * including CPUs, otherwise. * * It seems that at least some KVM versions report * EOI_SUPPRESSION bit, but auto-EOI does not work. */ ver = lapic_read32(LAPIC_VERSION); if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) { lapic_eoi_suppression = 1; if (vm_guest == VM_GUEST_KVM) { if (bootverbose) printf( "KVM -- disabling lapic eoi suppression\n"); lapic_eoi_suppression = 0; } TUNABLE_INT_FETCH("hw.lapic_eoi_suppression", &lapic_eoi_suppression); } #ifdef SMP #define LOOPS 100000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which * approximately delay execution for 1 microsecond (the * argument to native_lapic_ipi_wait() is in microseconds). * * We assume that TSC is present and already measured. * Possible TSC frequency jumps are irrelevant to the * calibration loop below, the CPU clock management code is * not yet started, and we do not enter sleep states. */ KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0, ("TSC not initialized")); if (!x2apic_mode) { r = rdtsc(); for (rx = 0; rx < LOOPS; rx++) { (void)lapic_read_icr_lo(); ia32_pause(); } r = rdtsc() - r; r1 = tsc_freq * LOOPS; r2 = r * 1000000; lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; if (bootverbose) { printf("LAPIC: ipi_wait() us multiplier %ju (r %ju " "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r, (uintmax_t)tsc_freq); } } #undef LOOPS #endif /* SMP */ } /* * Create a local APIC instance. */ static void native_lapic_create(u_int apic_id, int boot_cpu) { int i; if (apic_id > max_apic_id) { printf("APIC: Ignoring local APIC with ID %d\n", apic_id); if (boot_cpu) panic("Can't ignore BSP"); return; } KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", apic_id)); /* * Assume no local LVT overrides and a cluster of 0 and * intra-cluster ID of 0. 
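* Each new local APIC gets private copies of the global lvts[] and elvts[] templates with lvt_active cleared, so that later per-APIC overrides (the set_lvt_* methods) can activate individual entries without disturbing the global defaults.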
*/ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; for (i = 0; i <= APIC_LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } for (i = 0; i <= APIC_ELVT_MAX; i++) { lapics[apic_id].la_elvts[i] = elvts[i]; lapics[apic_id].la_elvts[i].lvt_active = 0; } for (i = 0; i <= APIC_NUM_IOINTS; i++) lapics[apic_id].la_ioint_irqs[i] = -1; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; #ifdef KDTRACE_HOOKS lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] = IRQ_DTRACE_RET; #endif #ifdef XENHVM lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN; #endif #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } static inline uint32_t amd_read_ext_features(void) { uint32_t version; if (cpu_vendor_id != CPU_VENDOR_AMD) return (0); version = lapic_read32(LAPIC_VERSION); if ((version & APIC_VER_AMD_EXT_SPACE) != 0) return (lapic_read32(LAPIC_EXT_FEATURES)); else return (0); } static inline uint32_t amd_read_elvt_count(void) { uint32_t extf; uint32_t count; extf = amd_read_ext_features(); count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; count = min(count, APIC_ELVT_MAX + 1); return (count); } /* * Dump contents of local APIC registers */ static void native_lapic_dump(const char* str) { uint32_t version; uint32_t maxlvt; uint32_t extf; int elvt_count; int i; version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x", lapic_read32(LAPIC_ID), version, lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR)); if ((cpu_feature2 & CPUID2_X2APIC) != 0) printf(" x2APIC: %d", x2apic_mode); printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1), lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR)); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x", lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL), lapic_read32(LAPIC_LVT_ERROR)); if (maxlvt >= APIC_LVT_PMC) printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT)); printf("\n"); if (maxlvt >= APIC_LVT_CMCI) printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI)); extf = amd_read_ext_features(); if (extf != 0) { printf(" AMD ext features: 0x%08x\n", extf); elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) printf(" AMD elvt%d: 0x%08x\n", i, lapic_read32(LAPIC_EXT_LVT0 + i)); } } static void native_lapic_xapic_mode(void) { register_t saveintr; saveintr = intr_disable(); if (x2apic_mode) native_lapic_enable_x2apic(); intr_restore(saveintr); } static void native_lapic_setup(int boot) { struct lapic *la; uint32_t version; uint32_t maxlvt; register_t saveintr; char buf[MAXCOMLEN + 1]; int elvt_count; int i; saveintr = intr_disable(); la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); /* Setup spurious vector and enable the local APIC. */ lapic_enable(); /* Program LINT[01] LVT entries. */ lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0, lapic_read32(LAPIC_LVT_LINT0))); lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1, lapic_read32(LAPIC_LVT_LINT1))); /* Program the PMC LVT entry if present. 
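* The PMC entry exists only if the version register advertises enough LVT entries; maxlvt above is the APIC_VER_MAXLVT field of LAPIC_VERSION.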
*/ if (maxlvt >= APIC_LVT_PMC) { lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, LAPIC_LVT_PCINT)); } /* Program timer LVT and setup handler. */ la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER, lapic_read32(LAPIC_LVT_TIMER)); la->lvt_timer_last = la->lvt_timer_base; lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base); if (boot) { snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid)); intrcnt_add(buf, &la->la_timer_count); } /* Setup the timer if configured. */ if (la->la_timer_mode != LAT_MODE_UNDEF) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); switch (la->la_timer_mode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_periodic(la); break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(la); break; case LAT_MODE_DEADLINE: lapic_timer_deadline(la); break; default: panic("corrupted la_timer_mode %p %d", la, la->la_timer_mode); } } /* Program error LVT and clear any existing errors. */ lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR, lapic_read32(LAPIC_LVT_ERROR))); lapic_write32(LAPIC_ESR, 0); /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. */ if (maxlvt >= APIC_LVT_CMCI) { lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI, lapic_read32(LAPIC_LVT_CMCI))); } elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) { if (la->la_elvts[i].lvt_active) lapic_write32(LAPIC_EXT_LVT0 + i, elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i))); } intr_restore(saveintr); } static void native_lapic_reenable_pmc(void) { #ifdef HWPMC_HOOKS uint32_t value; value = lapic_read32(LAPIC_LVT_PCINT); value &= ~APIC_LVT_M; lapic_write32(LAPIC_LVT_PCINT, value); #endif } #ifdef HWPMC_HOOKS static void lapic_update_pmc(void *dummy) { struct lapic *la; la = &lapics[lapic_id()]; lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, lapic_read32(LAPIC_LVT_PCINT))); } #endif static int native_lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return (0); /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return (0); lvts[APIC_LVT_PMC].lvt_masked = 0; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #else #ifdef SMP /* * If hwpmc was loaded at boot time then the APs may not be * started yet. In that case, don't forward the request to * them as they will program the lvt when they start. */ if (smp_started) smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); else #endif lapic_update_pmc(NULL); #endif return (1); #else return (0); #endif } static void native_lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return; lvts[APIC_LVT_PMC].lvt_masked = 1; #ifdef SMP /* The APs should always be started when hwpmc is unloaded. */ KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); #endif smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #endif } static void lapic_calibrate_initcount(struct eventtimer *et, struct lapic *la) { u_long value; /* Start off with a divisor of 2 (power on reset default). 
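* The calibration loop below arms a masked one-shot timer with APIC_TIMER_MAX_COUNT, busy-waits one second via DELAY(1000000), and reads the current-count register back: if the counter ran all the way down within that second the divisor is doubled and the measurement retried; otherwise APIC_TIMER_MAX_COUNT - CCR is the per-second tick count that becomes count_freq. (Rough, hypothetical illustration: a 100 MHz timer clock at divisor 2 ticks 50,000,000 times in that window, well inside 32 bits, so the first iteration usually suffices.)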
*/ lapic_timer_divisor = 2; /* Try to calibrate the local APIC timer. */ do { lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT); DELAY(1000000); value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER); if (value != APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) panic("lapic: Divisor too big"); if (bootverbose) { printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, value); } count_freq = value; } static void lapic_calibrate_deadline(struct eventtimer *et, struct lapic *la __unused) { if (bootverbose) { printf("lapic: deadline tsc mode, Frequency %ju Hz\n", (uintmax_t)tsc_freq); } } static void lapic_change_mode(struct eventtimer *et, struct lapic *la, enum lat_timer_mode newmode) { if (la->la_timer_mode == newmode) return; switch (newmode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; case LAT_MODE_DEADLINE: et->et_frequency = tsc_freq; break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; default: panic("lapic_change_mode %d", newmode); } la->la_timer_mode = newmode; et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; } static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; if (et->et_frequency == 0) { lapic_calibrate_initcount(et, la); if (lapic_timer_tsc_deadline) lapic_calibrate_deadline(et, la); } if (period != 0) { lapic_change_mode(et, la, LAT_MODE_PERIODIC); la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32; lapic_timer_periodic(la); } else if (lapic_timer_tsc_deadline) { lapic_change_mode(et, la, LAT_MODE_DEADLINE); la->la_timer_period = (et->et_frequency * first) >> 32; lapic_timer_deadline(la); } else { lapic_change_mode(et, la, LAT_MODE_ONESHOT); la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32; lapic_timer_oneshot(la); } return (0); } static int lapic_et_stop(struct eventtimer *et) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; lapic_timer_stop(la); la->la_timer_mode = LAT_MODE_UNDEF; return (0); } static void native_lapic_disable(void) { uint32_t value; /* Software disable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~APIC_SVR_SWEN; lapic_write32(LAPIC_SVR, value); } static void lapic_enable(void) { uint32_t value; /* Program the spurious vector to enable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT; if (lapic_eoi_suppression) value |= APIC_SVR_EOI_SUPPRESSION; lapic_write32(LAPIC_SVR, value); } /* Reset the local APIC on the BSP during resume. */ static void lapic_resume(struct pic *pic, bool suspend_cancelled) { lapic_setup(0); } static int native_lapic_id(void) { uint32_t v; KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped")); v = lapic_read32(LAPIC_ID); if (!x2apic_mode) v >>= APIC_ID_SHIFT; return (v); } static int native_lapic_intr_pending(u_int vector) { uint32_t irr; /* * The IRR registers are an array of registers each of which * only describes 32 interrupts in the low 32 bits. Thus, we * divide the vector by 32 to get the register index. * Finally, we modulus the vector by 32 to determine the * individual bit to test. 
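* For example (illustrative): vector 0x50, decimal 80, lives in IRR2 since 80 / 32 == 2, at bit 16 since 80 % 32 == 16, so the test below amounts to (lapic_read32(LAPIC_IRR0 + 2) >> 16) & 1.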
*/ irr = lapic_read32(LAPIC_IRR0 + vector / 32); return (irr & 1 << (vector % 32)); } static void native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", __func__, apic_id)); KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", __func__, cluster)); KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, ("%s: intra cluster id %u too big", __func__, cluster_id)); la = &lapics[apic_id]; la->la_cluster = cluster; la->la_cluster_id = cluster_id; } static int native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_masked = masked; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_masked = masked; lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked"); return (0); } static int native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvt = &lvts[pin]; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lvt = &lapics[apic_id].la_lvts[pin]; lvt->lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } lvt->lvt_mode = mode; switch (mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: lvt->lvt_edgetrigger = 1; lvt->lvt_activehi = 1; if (mode == APIC_LVT_DM_EXTINT) lvt->lvt_masked = 1; else lvt->lvt_masked = 0; break; default: panic("Unsupported delivery mode: 0x%x\n", mode); } if (bootverbose) { printf(" Routing "); switch (mode) { case APIC_LVT_DM_NMI: printf("NMI"); break; case APIC_LVT_DM_SMI: printf("SMI"); break; case APIC_LVT_DM_INIT: printf("INIT"); break; case APIC_LVT_DM_EXTINT: printf("ExtINT"); break; } printf(" -> LINT%u\n", pin); } return (0); } static int native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_active = 1; lapics[apic_id].la_lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u polarity: %s\n", pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) { if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u trigger: %s\n", pin, trigger == INTR_TRIGGER_EDGE ? 
"edge" : "level"); return (0); } /* * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. */ static void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR lapic_write32(LAPIC_TPR, vector); #else uint32_t tpr; tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO; tpr |= vector; lapic_write32(LAPIC_TPR, tpr); #endif } static void native_lapic_eoi(void) { lapic_write32_nofence(LAPIC_EOI, 0); } void lapic_handle_intr(int vector, struct trapframe *frame) { struct intsrc *isrc; isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id), vector)); intr_execute_handlers(isrc, frame); } void lapic_handle_timer(struct trapframe *frame) { struct lapic *la; struct trapframe *oldframe; struct thread *td; /* Send EOI first thing. */ lapic_eoi(); #if defined(SMP) && !defined(SCHED_ULE) /* * Don't do any accounting for the disabled HTT cores, since it * will provide misleading numbers for the userland. * * No locking is necessary here, since even if we lose the race * when hlt_cpus_mask changes it is not a big deal, really. * * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask * and unlike other schedulers it actually schedules threads to * those CPUs. */ if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask)) return; #endif /* Look up our local APIC structure for the tick counters. */ la = &lapics[PCPU_GET(apic_id)]; (*la->la_timer_count)++; critical_enter(); if (lapic_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } critical_exit(); } static void lapic_timer_set_divisor(u_int divisor) { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors), ("lapic: invalid divisor %u", divisor)); lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]); } static void lapic_timer_oneshot(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_ONE_SHOT; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count) { uint32_t value; value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, count); } static void lapic_timer_periodic(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_PERIODIC; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_deadline(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_TSCDLT; if (value != la->lvt_timer_last) { la->lvt_timer_last = value; lapic_write32_nofence(LAPIC_LVT_TIMER, value); if (!x2apic_mode) mfence(); } wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc()); } static void lapic_timer_stop(struct lapic *la) { uint32_t value; if (la->la_timer_mode == LAT_MODE_DEADLINE) { wrmsr(MSR_TSC_DEADLINE, 0); mfence(); } else { value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); } } void lapic_handle_cmc(void) { lapic_eoi(); cmc_intr(); 
} /* * Called from the mca_init() to activate the CMC interrupt if this CPU is * responsible for monitoring any MC banks for CMC events. Since mca_init() * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. */ static void native_lapic_enable_cmc(void) { u_int apic_id; #ifdef DEV_ATPIC if (!x2apic_mode && lapic_map == NULL) return; #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0; lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1; if (bootverbose) printf("lapic%u: CMCI unmasked\n", apic_id); } static int native_lapic_enable_mca_elvt(void) { u_int apic_id; uint32_t value; int elvt_count; #ifdef DEV_ATPIC if (lapic_map == NULL) return (-1); #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); elvt_count = amd_read_elvt_count(); if (elvt_count <= APIC_ELVT_MCA) return (-1); value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA); if ((value & APIC_LVT_M) == 0) { if (bootverbose) printf("AMD MCE Thresholding Extended LVT is already active\n"); return (APIC_ELVT_MCA); } lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0; lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1; if (bootverbose) printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id); return (APIC_ELVT_MCA); } void lapic_handle_error(void) { uint32_t esr; /* * Read the contents of the error status register. Write to * the register first before reading from it to force the APIC * to update its value to indicate any errors that have * occurred since the previous write to the register. */ lapic_write32(LAPIC_ESR, 0); esr = lapic_read32(LAPIC_ESR); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); } static u_int native_apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; #else return 0; #endif } /* Request a free IDT vector to be used by the specified IRQ. */ static u_int native_apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); /* * Search for a free vector. Currently we just use a very simple * algorithm to find the first free vector. */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { if (lapics[apic_id].la_ioint_irqs[vector] != -1) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); return (vector + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); return (0); } /* * Request 'count' free contiguous IDT vectors to be used by 'count' * IRQs. 'count' must be a power of two and the vectors will be * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. */ static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; KASSERT(powerof2(count), ("bad count")); KASSERT(powerof2(align), ("bad align")); KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif /* * Search for 'count' free vectors. As with apic_alloc_vector(), * this just uses a simple first fit algorithm. */ run = 0; first = 0; mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. 
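* (the run counters reset and the scan continues; note the alignment check that follows: with count = 4 and align = 4, for instance, a new run may only begin at a vector where (vector & 3) == 0)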
*/ if (lapics[apic_id].la_ioint_irqs[vector] != -1) { run = 0; first = 0; continue; } /* Start a new run if run == 0 and vector is aligned. */ if (run == 0) { if ((vector & (align - 1)) != 0) continue; first = vector; } run++; /* Keep looping if the run isn't long enough yet. */ if (run < count) continue; /* Found a run, assign IRQs and return the first vector. */ for (vector = 0; vector < count; vector++) lapics[apic_id].la_ioint_irqs[first + vector] = irqs[vector]; mtx_unlock_spin(&icu_lock); return (first + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count); return (0); } /* * Enable a vector for a particular apic_id. Since all lapics share idt * entries and ioint_handlers this enables the vector on all lapics. lapics * which do not have the vector configured would report spurious interrupts * should it fire. */ static void native_apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], SDT_APIC, SEL_KPL, GSEL_APIC); } static void native_apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef notyet /* * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ - setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif /* * Bind us to the cpu that owned the vector before freeing it so * we don't lose an interrupt delivery race. */ td = curthread; if (!rebooting) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n"); sched_bind(td, apic_cpuid(apic_id)); thread_unlock(td); } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); sched_unbind(td); thread_unlock(td); } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ static u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS]; if (irq < 0) irq = 0; return (irq); } #ifdef DDB /* * Dump data about APIC IDT vector mappings. 
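* From the ddb prompt, 'show apic' lists the vector-to-IRQ bindings; the v and vv modifiers (e.g. 'show apic/v') add increasingly verbose interrupt-event detail, as handled below.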
*/ DB_SHOW_COMMAND(apic, db_show_apic) { struct intsrc *isrc; int i, verbose; u_int apic_id; u_int irq; if (strcmp(modif, "vv") == 0) verbose = 2; else if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; for (apic_id = 0; apic_id <= max_apic_id; apic_id++) { if (lapics[apic_id].la_present == 0) continue; db_printf("Interrupts bound to lapic %u\n", apic_id); for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) { irq = lapics[apic_id].la_ioint_irqs[i]; if (irq == -1 || irq == IRQ_SYSCALL) continue; #ifdef KDTRACE_HOOKS if (irq == IRQ_DTRACE_RET) continue; #endif #ifdef XENHVM if (irq == IRQ_EVTCHN) continue; #endif db_printf("vec 0x%2x -> ", i + APIC_IO_INTS); if (irq == IRQ_TIMER) db_printf("lapic timer\n"); else if (irq < NUM_IO_INTS) { isrc = intr_lookup_source(irq); if (isrc == NULL || verbose == 0) db_printf("IRQ %u\n", irq); else db_dump_intr_event(isrc->is_event, verbose == 2); } else db_printf("IRQ %u ???\n", irq); } } } static void dump_mask(const char *prefix, uint32_t v, int base) { int i, first; first = 1; for (i = 0; i < 32; i++) if (v & (1 << i)) { if (first) { db_printf("%s:", prefix); first = 0; } db_printf(" %02x", base + i); } if (!first) db_printf("\n"); } /* Show info from the lapic regs for this CPU. */ DB_SHOW_COMMAND(lapic, db_show_lapic) { uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); v = lapic_read32(LAPIC_VERSION); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); v = lapic_read32(LAPIC_SVR); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR)); #define dump_field(prefix, regn, index) \ dump_mask(__XSTRING(prefix ## index), \ lapic_read32(LAPIC_ ## regn ## index), \ index * 32) db_printf("In-service Interrupts:\n"); dump_field(isr, ISR, 0); dump_field(isr, ISR, 1); dump_field(isr, ISR, 2); dump_field(isr, ISR, 3); dump_field(isr, ISR, 4); dump_field(isr, ISR, 5); dump_field(isr, ISR, 6); dump_field(isr, ISR, 7); db_printf("TMR Interrupts:\n"); dump_field(tmr, TMR, 0); dump_field(tmr, TMR, 1); dump_field(tmr, TMR, 2); dump_field(tmr, TMR, 3); dump_field(tmr, TMR, 4); dump_field(tmr, TMR, 5); dump_field(tmr, TMR, 6); dump_field(tmr, TMR, 7); db_printf("IRR Interrupts:\n"); dump_field(irr, IRR, 0); dump_field(irr, IRR, 1); dump_field(irr, IRR, 2); dump_field(irr, IRR, 3); dump_field(irr, IRR, 4); dump_field(irr, IRR, 5); dump_field(irr, IRR, 6); dump_field(irr, IRR, 7); #undef dump_field } #endif /* * APIC probing support code. This includes code to manage enumerators. */ static SLIST_HEAD(, apic_enumerator) enumerators = SLIST_HEAD_INITIALIZER(enumerators); static struct apic_enumerator *best_enum; void apic_register_enumerator(struct apic_enumerator *enumerator) { #ifdef INVARIANTS struct apic_enumerator *apic_enum; SLIST_FOREACH(apic_enum, &enumerators, apic_next) { if (apic_enum == enumerator) panic("%s: Duplicate register of %s", __func__, enumerator->apic_name); } #endif SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); } /* * We have to look for CPUs very, very early because certain subsystems * want to know how many CPUs we have extremely early on in the boot * process. */ static void apic_init(void *dummy __unused) { struct apic_enumerator *enumerator; int retval, best; /* We only support built in local APICs. */ if (!(cpu_feature & CPUID_APIC)) return; /* Don't probe if APIC mode is disabled.
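* (resource_disabled() below honors the standard device-hint mechanism, e.g. hint.apic.0.disabled="1" set from the loader)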
*/ if (resource_disabled("apic", 0)) return; /* Probe all the enumerators to find the best match. */ best_enum = NULL; best = 0; SLIST_FOREACH(enumerator, &enumerators, apic_next) { retval = enumerator->apic_probe(); if (retval > 0) continue; if (best_enum == NULL || best < retval) { best_enum = enumerator; best = retval; } } if (best_enum == NULL) { if (bootverbose) printf("APIC: Could not find any APICs.\n"); #ifndef DEV_ATPIC panic("running without device atpic requires a local APIC"); #endif return; } if (bootverbose) printf("APIC: Using the %s enumerator.\n", best_enum->apic_name); #ifdef I686_CPU /* * To work around an erratum, we disable the local APIC on some * CPUs during early startup. We need to turn the local APIC back * on on such CPUs now. */ ppro_reenable_apic(); #endif /* Probe the CPUs in the system. */ retval = best_enum->apic_probe_cpus(); if (retval != 0) printf("%s: Failed to probe CPUs: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL); /* * Setup the local APIC. We have to do this prior to starting up the APs * in the SMP case. */ static void apic_setup_local(void *dummy __unused) { int retval; if (best_enum == NULL) return; lapics = malloc(sizeof(*lapics) * (max_apic_id + 1), M_LAPIC, M_WAITOK | M_ZERO); /* Initialize the local APIC. */ retval = best_enum->apic_setup_local(); if (retval != 0) printf("%s: Failed to setup the local APIC: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL); /* * Setup the I/O APICs. */ static void apic_setup_io(void *dummy __unused) { int retval; if (best_enum == NULL) return; /* * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ intr_register_pic(&lapic_pic); retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); /* * Finish setting up the local APIC on the BSP once we know * how to properly program the LINT pins. In particular, this * enables the EOI suppression mode, if the LAPIC supports it and * the user did not disable the mode. */ lapic_setup(1); if (bootverbose) lapic_dump("BSP"); /* Enable the MSI "pic". */ init_ops.msi_init(); } SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL); #ifdef SMP /* * Inter Processor Interrupt functions. The lapic_ipi_*() functions are * private to the MD code. The public interface for the rest of the * kernel is defined in mp_machdep.c. */ /* * Wait delay microseconds for IPI to be sent. If delay is -1, we * wait forever. */ static int native_lapic_ipi_wait(int delay) { uint64_t rx; /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */ if (x2apic_mode) return (1); for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) { if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } return (0); } static void native_lapic_ipi_raw(register_t icrlo, u_int dest) { uint64_t icr; uint32_t vhi, vlo; register_t saveintr; /* XXX: Need more sanity checking of icrlo? */ KASSERT(x2apic_mode || lapic_map != NULL, ("%s called too early", __func__)); KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, ("%s: reserved bits set in ICR LO register", __func__)); /* Set destination in ICR HI register if it is being used.
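* In xAPIC mode the destination occupies the APIC ID field (bits 24-31) of ICR_HI, hence the APIC_ID_SHIFT below; in x2APIC mode the ICR is a single 64-bit MSR whose high half carries the full destination, so dest is used unshifted.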
*/ if (!x2apic_mode) { saveintr = intr_disable(); icr = lapic_read_icr(); } if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { if (x2apic_mode) { vhi = dest; } else { vhi = icr >> 32; vhi &= ~APIC_ID_MASK; vhi |= dest << APIC_ID_SHIFT; } } else { vhi = 0; } /* Program the contents of the IPI and dispatch it. */ if (x2apic_mode) { vlo = icrlo; } else { vlo = icr; vlo &= APIC_ICRLO_RESV_MASK; vlo |= icrlo; } lapic_write_icr(vhi, vlo); if (!x2apic_mode) intr_restore(saveintr); } #define BEFORE_SPIN 50000 #ifdef DETECT_DEADLOCK #define AFTER_SPIN 50 #endif static void native_lapic_ipi_vectored(u_int vector, int dest) { register_t icrlo, destfield; KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT; /* * NMI IPIs are just fake vectors used to send an NMI. Use special rules * regarding NMIs if passed, otherwise specify the vector. */ if (vector >= IPI_NMI_FIRST) icrlo |= APIC_DELMODE_NMI; else icrlo |= vector | APIC_DELMODE_FIXED; destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: icrlo |= APIC_DEST_SELF; break; case APIC_IPI_DEST_ALL: icrlo |= APIC_DEST_ALLISELF; break; case APIC_IPI_DEST_OTHERS: icrlo |= APIC_DEST_ALLESELF; break; default: KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid destination 0x%x", __func__, dest)); destfield = dest; } /* Wait for an earlier IPI to finish. */ if (!lapic_ipi_wait(BEFORE_SPIN)) { if (panicstr != NULL) return; else panic("APIC: Previous IPI is stuck"); } lapic_ipi_raw(icrlo, destfield); #ifdef DETECT_DEADLOCK /* Wait for IPI to be delivered. */ if (!lapic_ipi_wait(AFTER_SPIN)) { #ifdef needsattention /* * XXX FIXME: * * The above function waits for the message to actually be * delivered. It breaks out after an arbitrary timeout * since the message should eventually be delivered (at * least in theory) and if it wasn't we would catch * the failure with the check above when the next IPI is * sent. * * We could skip this wait entirely, EXCEPT it probably * protects us from other routines that assume that the * message was delivered and acted upon when this function * returns. */ printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until the message is sent without a timeout. */ while (lapic_read_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } #endif /* DETECT_DEADLOCK */ } #endif /* SMP */ /* * Since the IDT is shared by all CPUs the IPI slot update needs to be globally * visible. * * Consider the case where an IPI is generated immediately after allocation: * vector = lapic_ipi_alloc(ipifunc); * ipi_selected(other_cpus, vector); * * In xAPIC mode a write to ICR_LO has serializing semantics because the * APIC page is mapped as an uncached region. In x2APIC mode there is an * explicit 'mfence' before the ICR MSR is written. Therefore in both cases * the IDT slot update is globally visible before the IPI is delivered.
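* (native_lapic_ipi_alloc() below claims a slot by scanning the dynamic range IPI_DYN_FIRST..IPI_DYN_LAST of the IDT for an entry still pointing at the reserved stub, IDTVEC(rsvd), or IDTVEC(rsvd_pti) under PTI, and installing the new handler with setidt())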
*/ static int native_lapic_ipi_alloc(inthand_t *ipifunc) { struct gate_descriptor *ip; long func; int idx, vector; KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti), ("invalid ipifunc %p", ipifunc)); vector = -1; mtx_lock_spin(&icu_lock); for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) { ip = &idt[idx]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) || (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) { vector = idx; setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC); break; } } mtx_unlock_spin(&icu_lock); return (vector); } static void native_lapic_ipi_free(int vector) { struct gate_descriptor *ip; long func; KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST, ("%s: invalid vector %d", __func__, vector)); mtx_lock_spin(&icu_lock); ip = &idt[vector]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && func != (uintptr_t)&IDTVEC(rsvd_pti), ("invalid idtfunc %#lx", func)); - setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); } Index: head/sys/x86/x86/mp_x86.c =================================================================== --- head/sys/x86/x86/mp_x86.c (revision 332488) +++ head/sys/x86/x86/mp_x86.c (revision 332489) @@ -1,1733 +1,1741 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ /* AP uses this during bootstrap. Do not staticize. 
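* (bootSTK and bootAP are consumed by the MP boot trampoline outside this file, hence the external linkage)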
*/ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; volatile cpuset_t resuming_cpus; volatile cpuset_t toresume_cpus; /* used to hold the APs until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info *cpu_info; int *apic_cpuids; int cpu_apic_ids[MAXCPU]; _Static_assert(MAXCPU <= MAX_APIC_ID, "MAXCPU cannot be larger than MAX_APIC_ID"); _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID"); /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static struct topo_node topo_root; static int pkg_id_shift; static int node_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; unsigned int boot_address; #define MiB(v) (v ## ULL << 20) void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. */ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", level); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package (%u > %u)\n", level, caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers fewer " "APIC IDs than a core (%u < %u)\n", level, caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs.
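* The *_id_shift values computed here split the APIC ID into per-component bit fields. mask_width() above rounds a count up to a power of two and takes log2, so a hypothetical 6-core package gives pkg_id_shift = 3 and APIC IDs 0 through 7 all land in package 0.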
* See: * - AMD CPUID Specification (Publication # 25481) * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * - BKDG For AMD Family 10h Processors (Publication # 31116) * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) */ static void topo_probe_amd(void) { u_int p[4]; uint64_t v; int level; int nodes_per_socket; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); /* * Families prior to 16h define the following value as * cores per compute unit and we don't really care about the AMD * compute units at the moment. Perhaps we should treat them as * cores and cores within the compute units as hardware threads, * but that's up for debate. * Later families define the value as threads per compute unit, * so we are following AMD's nomenclature here. */ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && CPUID_TO_FAMILY(cpu_id) >= 0x16) { cpuid_count(0x8000001e, 0, p); share_count = ((p[1] >> 8) & 0xff) + 1; core_id_shift = mask_width(share_count); /* * For Zen (17h), gather Nodes per Processor. Each node is a * Zeppelin die; TR and EPYC CPUs will have multiple dies per * package. Communication latency between dies is higher than * within them. */ nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); } if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { nodes_per_socket = 1; if ((amd_feature2 & AMDID2_NODE_ID) != 0) { /* * Handle multi-node processors that * have multiple chips, each with its * own L3 cache, on the same die. */ v = rdmsr(0xc001100c); nodes_per_socket = 1 + ((v >> 3) & 0x7); } caches[2].id_shift = pkg_id_shift - mask_width(nodes_per_socket); caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ?
(cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 11, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int p[4]; int bits; int type; int i; /* Fall back if CPU leaf 11 doesn't really exist. */ cpuid_count(0x0b, 0, p); if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(0x0b, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else printf("unknown CPU level type %d\n", type); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache levels and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that the 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on the BSP, on which this * code runs and for which it can query CPUID information. * Then the topology is extrapolated to all packages using the * assumption that the APIC ID to hardware component ID mapping is * homogeneous. * That doesn't necessarily imply that the topology is uniform.
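* As a hypothetical decode with core_id_shift = 1 and pkg_id_shift = 3: APIC ID 0xb (binary 1011) is package 1 (0xb >> 3), core 1 ((0xb >> 1) & 0x3) and SMT thread 1 (0xb & 0x1).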
*/ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; } topo_layers[MAX_CACHE_LEVELS + 4]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; if (pkg_id_shift > node_id_shift && node_id_shift != 0) { topo_layers[nlayers].type = TOPO_TYPE_GROUP; topo_layers[nlayers].id_shift = node_id_shift; if (bootverbose) printf("Node ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. */ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { if (node_id_shift != 0) KASSERT(caches[i].id_shift <= node_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; topo_init_root(&topo_root); for (i = 0; i <= max_apic_id; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. 
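* The smt_mask test below tags a PU as a hyperthread when its low thread-index bits (hwid & smt_mask) differ from the BSP's; with the usual BSP thread index of 0, the first thread of each core is therefore never subject to the hyperthreading_allowed policy.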
*/ mp_ncpus = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; struct topo_analysis topology; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. 
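* An illustrative shape, not specific to this change: two packages, each with a shared L3 and private per-core L2s, yield SYSTEM with two L3 children, each with per-core children, each holding its PUs; cache nodes covering exactly the same CPUs as their parent are treated as redundant and skipped, as coded below.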
*/ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || root->type == TOPO_TYPE_GROUP, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; if (root->type == TOPO_TYPE_SYSTEM) cg_root->cg_level = CG_SHARE_NONE; else cg_root->cg_level = root->subtype; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) cg_root->cg_flags = CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_CACHE) || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } cg_root->cg_child = smp_topo_alloc(nchildren); cg_root->cg_children = nchildren; /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_CACHE) || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. */ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } static void cpu_alloc(void *dummy __unused) { /* * Dynamically allocate the arrays that depend on the * maximum APIC ID. */ cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); } SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > max_apic_id) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %u claims to be BSP, but CPU %u already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (bootverbose) printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. 
*/ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* Allocate memory for the AP trampoline. */ void alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx) { unsigned int i; bool allocated; allocated = false; for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { /* * Find a memory region big enough and below the 1MB boundary * for the trampoline code. * NB: needs to be page aligned. */ if (physmap[i] >= MiB(1) || (trunc_page(physmap[i + 1]) - round_page(physmap[i])) < round_page(bootMP_size)) continue; allocated = true; /* * Try to steal from the end of the region to mimic previous * behaviour, else fallback to steal from the start. */ if (physmap[i + 1] < MiB(1)) { boot_address = trunc_page(physmap[i + 1]); if ((physmap[i + 1] - boot_address) < bootMP_size) boot_address -= round_page(bootMP_size); physmap[i + 1] = boot_address; } else { boot_address = round_page(physmap[i]); physmap[i] = boot_address + round_page(bootMP_size); } if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { memmove(&physmap[i], &physmap[i + 2], sizeof(*physmap) * (*physmap_idx - i + 2)); *physmap_idx -= 2; } break; } if (!allocated) { boot_address = basemem * 1024 - bootMP_size; if (bootverbose) printf( "Cannot find enough space for the boot trampoline, placing it at %#x", boot_address); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. 
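 *
 * A standalone sketch of the physmap scan performed by
 * alloc_ap_trampoline() above: walk the (start, end) pairs from the
 * top down and carve a page-aligned hole below 1 MB.  The map contents
 * and trampoline size are hypothetical stand-ins for the kernel's
 * physmap[] and bootMP_size.
 */
#include <stdint.h>
#include <stdio.h>

#define	PAGE_SIZE	4096
#define	MiB(v)		((v) * 1024 * 1024)
#define	trunc_page(x)	((x) & ~(uint64_t)(PAGE_SIZE - 1))
#define	round_page(x)	trunc_page((x) + PAGE_SIZE - 1)

int
main(void)
{
	uint64_t physmap[] = { 0x1000, 0x9f000, 0x100000, 0x7fe00000 };
	unsigned int physmap_idx = 2;	/* index of the last pair */
	uint64_t tramp_size = 3072;
	uint64_t boot_address = 0;
	unsigned int i;

	/*
	 * The unsigned index wraps past zero and terminates the loop,
	 * the same trick the kernel's loop condition uses.
	 */
	for (i = physmap_idx; i <= physmap_idx; i -= 2) {
		if (physmap[i] >= MiB(1) ||
		    trunc_page(physmap[i + 1]) - round_page(physmap[i]) <
		    round_page(tramp_size))
			continue;
		if (physmap[i + 1] < MiB(1)) {
			/* Steal from the end of the region. */
			boot_address = trunc_page(physmap[i + 1]);
			if (physmap[i + 1] - boot_address < tramp_size)
				boot_address -= round_page(tramp_size);
		} else
			boot_address = round_page(physmap[i]);
		break;
	}
	printf("trampoline at %#jx\n", (uintmax_t)boot_address);
	return (0);
}

/*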
*/ cpu_initclocks_ap(); #endif sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. 
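 *
 * The 8-bit STARTUP IPI vector used above is, by the usual x86
 * convention, the 4 KB page number of the real-mode trampoline, which
 * is why alloc_ap_trampoline() insists on a page-aligned address below
 * 1 MB.  A tiny sketch with a hypothetical trampoline address:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t boot_address = 0x9e000;	/* hypothetical */
	uint8_t vector = boot_address >> 12;

	/* The AP begins fetching instructions at vector << 12. */
	printf("SIPI vector %#x -> AP entry %#x\n",
	    vector, (uint32_t)vector << 12);
	return (0);
}

/*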
*/ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. 
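 *
 * A userland model of the lock-free coalescing that ipi_send_cpu()
 * performs above for bitmapped IPIs: every requester ORs its bit into
 * the per-CPU pending word, but only the request that finds the word
 * empty raises the shared vector.  C11 atomics stand in for the
 * kernel's atomic_cmpset_int().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int pending;	/* one word per CPU in the kernel */

/* Returns true when the caller must actually deliver the vector. */
static bool
post_bitmapped_ipi(unsigned int ipi)
{
	unsigned int old, new;

	old = atomic_load(&pending);
	do {
		new = old | 1u << ipi;
	} while (!atomic_compare_exchange_weak(&pending, &old, new));
	return (old == 0);
}

int
main(void)
{
	printf("first post delivers: %d\n", post_bitmapped_ipi(1));
	printf("second post coalesces: %d\n", !post_bitmapped_ipi(2));
	/* The handler drains all pending bits at once. */
	printf("drained bitmap: %#x\n", atomic_exchange(&pending, 0));
	return (0);
}

/*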
*/ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } #ifdef DEV_ISA int nmi_kdb_lock; void nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { nmi_call_kdb(cpu, type, frame); call_post = false; } else { savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); } #endif /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); /* * We don't broadcast TLB invalidations to other CPUs when they are * stopped. Hence, we clear the TLB before resuming. */ invltlb_glob(); #if defined(__amd64__) && defined(DDB) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); /* * Hack for xen, which does not use resumectx() so never * uses the next clause: set resuming_cpus early so that * resume_cpus() can wait on the same bitmap for acpi and * xen. resuming_cpus now means eventually_resumable_cpus. */ CPU_SET_ATOMIC(cpu, &resuming_cpus); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we are resuming */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume directive */ while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); #ifdef __i386__ /* Finish removing the identity mapping of low memory for this AP. */ invltlb_glob(); #endif if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &toresume_cpus); } void invlcache_handler(void) { uint32_t generation; #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ /* * Reading the generation here allows greater parallelism * since wbinvd is a serializing instruction. Without the * temporary, we'd wait for wbinvd to complete, then the read * would execute, then the dependent write, which must then * complete before return from interrupt. */ generation = smp_tlb_generation; wbinvd(); PCPU_SET(smp_tlb_done, generation); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. 
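 *
 * The "pen" release below is a two-flag handshake: the BSP publishes
 * aps_ready with release semantics and then spins on smp_started,
 * which the last AP sets.  A minimal pthreads model with one "AP"
 * (the names mirror the kernel's, the structure is simplified):
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int aps_ready;
static _Atomic int smp_started;

static void *
ap_main(void *arg)
{
	while (atomic_load_explicit(&aps_ready, memory_order_acquire) == 0)
		;	/* the kernel spins with ia32_pause() */
	atomic_store_explicit(&smp_started, 1, memory_order_release);
	return (NULL);
}

int
main(void)
{
	pthread_t ap;

	pthread_create(&ap, NULL, ap_main, NULL);
	atomic_store_explicit(&aps_ready, 1, memory_order_release);
	while (atomic_load_explicit(&smp_started, memory_order_acquire) == 0)
		;
	pthread_join(ap, NULL);
	printf("APs released and started\n");
	return (0);
}

/*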
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Set up interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif

/*
 * Flush the TLB on other CPUs.
 */

/* Variables needed for SMP TLB shootdown. */
vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap;
volatile uint32_t smp_tlb_generation;

#ifdef __amd64__
#define	read_eflags() read_rflags()
#endif

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
{
	cpuset_t other_cpus;
	volatile uint32_t *p_cpudone;
	uint32_t generation;
	int cpu;

	/* It is not necessary to signal other CPUs while in the debugger. */
	if (kdb_active || panicstr != NULL)
		return;

	/*
	 * Check for other CPUs.  Return if none.
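	 *
	 * The dispatch loops below peel CPUs off a mask lowest bit
	 * first.  A sketch of the idiom with ffs(3) over a plain word;
	 * the kernel's CPU_FFS() generalizes it to the multi-word
	 * cpuset_t:
	 */
#include <stdio.h>
#include <strings.h>

int
main(void)
{
	unsigned int mask = 0x29;	/* CPUs 0, 3 and 5 */
	int cpu;

	while ((cpu = ffs(mask)) != 0) {
		cpu--;			/* ffs() numbers bits from 1 */
		mask &= ~(1u << cpu);
		printf("send vector to cpu %d\n", cpu);
	}
	return (0);
}

	/*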
*/ if (CPU_ISFULLSET(&mask)) { if (mp_ncpus <= 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; generation = ++smp_tlb_generation; if (CPU_ISFULLSET(&mask)) { ipi_all_but_self(vector); other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); } else { other_cpus = mask; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); } } while ((cpu = CPU_FFS(&other_cpus)) != 0) { cpu--; CPU_CLR(cpu, &other_cpus); p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; while (*p_cpudone != generation) ia32_pause(); } mtx_unlock_spin(&smp_ipi_mtx); } void smp_masked_invltlb(cpuset_t mask, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_cache_flush(void) { if (smp_started) { smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0); } } /* * Handlers for TLB related IPIs */ void invltlb_handler(void) { uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ /* * Reading the generation here allows greater parallelism * since invalidating the TLB is a serializing operation. 
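	 *
	 * A pthreads model of the generation handshake used by the
	 * shootdown code: the initiator bumps a global generation, each
	 * target snapshots it before the serializing invalidation and
	 * publishes the snapshot afterwards, and the initiator spins
	 * until every per-CPU slot matches.  One responder stands in
	 * for the target CPUs.
	 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int tlb_gen;	/* smp_tlb_generation */
static _Atomic unsigned int cpu_done;	/* pc_smp_tlb_done, one per CPU */

static void *
responder(void *arg)
{
	unsigned int gen;

	while ((gen = atomic_load(&tlb_gen)) == 0)
		;			/* wait for a request */
	/* ... perform the serializing TLB invalidation here ... */
	atomic_store(&cpu_done, gen);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	unsigned int gen;

	pthread_create(&t, NULL, responder, NULL);
	gen = atomic_fetch_add(&tlb_gen, 1) + 1; /* ++smp_tlb_generation */
	while (atomic_load(&cpu_done) != gen)
		;			/* the initiator's wait loop above */
	pthread_join(t, NULL);
	printf("generation %u acknowledged\n", gen);
	return (0);
}

	/*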
*/ generation = smp_tlb_generation; if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); +#ifdef __amd64__ else invltlb(); +#endif PCPU_SET(smp_tlb_done, generation); } void invlpg_handler(void) { uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ generation = smp_tlb_generation; /* Overlap with serialization */ - invlpg(smp_tlb_addr1); +#ifdef __i386__ + if (smp_tlb_pmap == kernel_pmap) +#endif + invlpg(smp_tlb_addr1); PCPU_SET(smp_tlb_done, generation); } void invlrng_handler(void) { vm_offset_t addr, addr2; uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; generation = smp_tlb_generation; /* Overlap with serialization */ - do { - invlpg(addr); - addr += PAGE_SIZE; - } while (addr < addr2); +#ifdef __i386__ + if (smp_tlb_pmap == kernel_pmap) +#endif + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); PCPU_SET(smp_tlb_done, generation); } Index: head/sys/x86/x86/mptable.c =================================================================== --- head/sys/x86/x86/mptable.c (revision 332488) +++ head/sys/x86/x86/mptable.c (revision 332489) @@ -1,1260 +1,1271 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_mptable_force_htt.h" #include #include #include #include #include #include #include #ifdef NEW_PCIB #include #endif #include #include #include #include #ifdef NEW_PCIB #include #endif #include #include #include #include #include #include #ifdef NEW_PCIB #include #endif #include /* string defined by the Intel MP Spec as identifying the MP table */ #define MP_SIG 0x5f504d5f /* _MP_ */ #ifdef __amd64__ #define MAX_LAPIC_ID 63 /* Max local APIC ID for HTT fixup */ #else #define MAX_LAPIC_ID 31 /* Max local APIC ID for HTT fixup */ #endif #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #define BIOS_COUNT (BIOS_SIZE/4) typedef void mptable_entry_handler(u_char *entry, void *arg); typedef void mptable_extended_entry_handler(ext_entry_ptr entry, void *arg); /* descriptions of MP table entries */ typedef struct BASETABLE_ENTRY { uint8_t type; uint8_t length; uint8_t name[16]; } basetable_entry; static basetable_entry basetable_entry_types[] = { {0, 20, "Processor"}, {1, 8, "Bus"}, {2, 8, "I/O APIC"}, {3, 8, "I/O INT"}, {4, 8, "Local INT"} }; typedef struct BUSDATA { u_char bus_id; enum busTypes bus_type; } bus_datum; typedef struct INTDATA { u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; u_char int_vector; } io_int, local_int; typedef struct BUSTYPENAME { u_char type; char name[7]; } bus_type_name; /* From MP spec v1.4, table 4-8. */ static bus_type_name bus_type_table[] = { {UNKNOWN_BUSTYPE, "CBUS "}, {UNKNOWN_BUSTYPE, "CBUSII"}, {EISA, "EISA "}, {UNKNOWN_BUSTYPE, "FUTURE"}, {UNKNOWN_BUSTYPE, "INTERN"}, {ISA, "ISA "}, {UNKNOWN_BUSTYPE, "MBI "}, {UNKNOWN_BUSTYPE, "MBII "}, {MCA, "MCA "}, {UNKNOWN_BUSTYPE, "MPI "}, {UNKNOWN_BUSTYPE, "MPSA "}, {UNKNOWN_BUSTYPE, "NUBUS "}, {PCI, "PCI "}, {UNKNOWN_BUSTYPE, "PCMCIA"}, {UNKNOWN_BUSTYPE, "TC "}, {UNKNOWN_BUSTYPE, "VL "}, {UNKNOWN_BUSTYPE, "VME "}, {UNKNOWN_BUSTYPE, "XPRESS"} }; /* From MP spec v1.4, table 5-1. */ static int default_data[7][5] = { /* nbus, id0, type0, id1, type1 */ {1, 0, ISA, 255, NOBUS}, {1, 0, EISA, 255, NOBUS}, {1, 0, EISA, 255, NOBUS}, {1, 0, MCA, 255, NOBUS}, {2, 0, ISA, 1, PCI}, {2, 0, EISA, 1, PCI}, {2, 0, MCA, 1, PCI} }; struct pci_probe_table_args { u_char bus; u_char found; }; struct pci_route_interrupt_args { u_char bus; /* Source bus. */ u_char irq; /* Source slot:pin. */ int vector; /* Return value. 
*/ }; static mpfps_t mpfps; static mpcth_t mpct; static ext_entry_ptr mpet; static void *ioapics[IOAPIC_MAX_ID + 1]; static bus_datum *busses; static int mptable_nioapics, mptable_nbusses, mptable_maxbusid; static int pci0 = -1; static MALLOC_DEFINE(M_MPTABLE, "mptable", "MP Table Items"); static enum intr_polarity conforming_polarity(u_char src_bus, u_char src_bus_irq); static enum intr_trigger conforming_trigger(u_char src_bus, u_char src_bus_irq); static enum intr_polarity intentry_polarity(int_entry_ptr intr); static enum intr_trigger intentry_trigger(int_entry_ptr intr); static int lookup_bus_type(char *name); static void mptable_count_items(void); static void mptable_count_items_handler(u_char *entry, void *arg); #ifdef MPTABLE_FORCE_HTT static void mptable_hyperthread_fixup(u_int id_mask); #endif static void mptable_parse_apics_and_busses(void); static void mptable_parse_apics_and_busses_handler(u_char *entry, void *arg); static void mptable_parse_default_config_ints(void); static void mptable_parse_ints(void); static void mptable_parse_ints_handler(u_char *entry, void *arg); static void mptable_parse_io_int(int_entry_ptr intr); static void mptable_parse_local_int(int_entry_ptr intr); static void mptable_pci_probe_table_handler(u_char *entry, void *arg); static void mptable_pci_route_interrupt_handler(u_char *entry, void *arg); static void mptable_pci_setup(void); static int mptable_probe(void); static int mptable_probe_cpus(void); static void mptable_probe_cpus_handler(u_char *entry, void *arg __unused); static void mptable_setup_cpus_handler(u_char *entry, void *arg __unused); static void mptable_register(void *dummy); static int mptable_setup_local(void); static int mptable_setup_io(void); #ifdef NEW_PCIB static void mptable_walk_extended_table( mptable_extended_entry_handler *handler, void *arg); #endif static void mptable_walk_table(mptable_entry_handler *handler, void *arg); static int search_for_sig(u_int32_t target, int count); static struct apic_enumerator mptable_enumerator = { "MPTable", mptable_probe, mptable_probe_cpus, mptable_setup_local, mptable_setup_io }; /* * look for the MP spec signature */ static int search_for_sig(u_int32_t target, int count) { int x; - u_int32_t *addr = (u_int32_t *) (KERNBASE + target); + u_int32_t *addr; +#ifdef __amd64__ + addr = (u_int32_t *) (KERNBASE + target); +#else /* __i386__ */ + addr = (u_int32_t *) (PMAP_MAP_LOW + target); +#endif for (x = 0; x < count; x += 4) if (addr[x] == MP_SIG) /* make array index a byte index */ return (target + (x * sizeof(u_int32_t))); return (-1); } static int lookup_bus_type(char *name) { int x; for (x = 0; x < MAX_BUSTYPE; ++x) if (strncmp(bus_type_table[x].name, name, 6) == 0) return (bus_type_table[x].type); return (UNKNOWN_BUSTYPE); } /* * Look for an Intel MP spec table (ie, SMP capable hardware). 
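 *
 * Two small facts the probe below depends on, in a standalone sketch:
 * the real-mode EBDA segment word at physical 0x40e converts to a
 * physical address by shifting left four bits, and MP_SIG is simply
 * "_MP_" read as a little-endian 32-bit word.  The segment value is
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint16_t ebda_segment = 0x9fc0;		/* hypothetical */
	uint32_t ebda_phys = (uint32_t)ebda_segment << 4;
	uint32_t sig;

	printf("EBDA at %#x; its first 1K is searched\n", ebda_phys);

	/* Holds on little-endian hosts such as x86. */
	memcpy(&sig, "_MP_", sizeof(sig));
	printf("MP_SIG matches: %d\n", sig == 0x5f504d5f);
	return (0);
}

/*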
*/ static int mptable_probe(void) { int x; u_long segment; u_int32_t target; /* see if EBDA exists */ - if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { + if ((segment = (u_long) * (u_short *) ( +#ifdef __amd64__ + KERNBASE +#else /* __i386__ */ + PMAP_MAP_LOW +#endif + + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) ((basemem * 1024) - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ return (ENXIO); found: mpfps = (mpfps_t)(KERNBASE + x); /* Map in the configuration table if it exists. */ if (mpfps->config_type != 0) { if (bootverbose) printf( "MP Table version 1.%d found using Default Configuration %d\n", mpfps->spec_rev, mpfps->config_type); if (mpfps->config_type != 5 && mpfps->config_type != 6) { printf( "MP Table Default Configuration %d is unsupported\n", mpfps->config_type); return (ENXIO); } mpct = NULL; } else { if ((uintptr_t)mpfps->pap >= 1024 * 1024) { printf("%s: Unable to map MP Configuration Table\n", __func__); return (ENXIO); } mpct = (mpcth_t)(KERNBASE + (uintptr_t)mpfps->pap); if (mpct->base_table_length + (uintptr_t)mpfps->pap >= 1024 * 1024) { printf("%s: Unable to map end of MP Config Table\n", __func__); return (ENXIO); } if (mpct->extended_table_length != 0 && mpct->extended_table_length + mpct->base_table_length + (uintptr_t)mpfps->pap < 1024 * 1024) mpet = (ext_entry_ptr)((char *)mpct + mpct->base_table_length); if (mpct->signature[0] != 'P' || mpct->signature[1] != 'C' || mpct->signature[2] != 'M' || mpct->signature[3] != 'P') { printf("%s: MP Config Table has bad signature: %c%c%c%c\n", __func__, mpct->signature[0], mpct->signature[1], mpct->signature[2], mpct->signature[3]); return (ENXIO); } if (bootverbose) printf( "MP Configuration Table version 1.%d found at %p\n", mpct->spec_rev, mpct); } return (-100); } /* * Run through the MP table enumerating CPUs. */ static int mptable_probe_cpus(void) { u_int cpu_mask; /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { #ifdef SMP mp_ncpus = 2; mp_maxid = 1; #endif max_apic_id = 1; } else { mptable_walk_table(mptable_probe_cpus_handler, &cpu_mask); } return (0); } /* * Initialize the local APIC on the BSP. */ static int mptable_setup_local(void) { vm_paddr_t addr; u_int cpu_mask; /* Is this a pre-defined config? */ printf("MPTable: <"); if (mpfps->config_type != 0) { lapic_create(0, 1); lapic_create(1, 0); addr = DEFAULT_APIC_BASE; printf("Default Configuration %d", mpfps->config_type); } else { cpu_mask = 0; mptable_walk_table(mptable_setup_cpus_handler, &cpu_mask); #ifdef MPTABLE_FORCE_HTT mptable_hyperthread_fixup(cpu_mask); #endif addr = mpct->apic_address; printf("%.*s %.*s", (int)sizeof(mpct->oem_id), mpct->oem_id, (int)sizeof(mpct->product_id), mpct->product_id); } printf(">\n"); lapic_init(addr); return (0); } /* * Run through the MP table enumerating I/O APICs. */ static int mptable_setup_io(void) { int i; u_char byte; /* First, we count individual items and allocate arrays. */ mptable_count_items(); busses = malloc((mptable_maxbusid + 1) * sizeof(bus_datum), M_MPTABLE, M_WAITOK); for (i = 0; i <= mptable_maxbusid; i++) busses[i].bus_type = NOBUS; /* Second, we run through adding I/O APIC's and buses. 
*/ mptable_parse_apics_and_busses(); /* Third, we run through the table tweaking interrupt sources. */ mptable_parse_ints(); /* Fourth, we register all the I/O APIC's. */ for (i = 0; i <= IOAPIC_MAX_ID; i++) if (ioapics[i] != NULL) ioapic_register(ioapics[i]); /* Fifth, we setup data structures to handle PCI interrupt routing. */ mptable_pci_setup(); /* Finally, we throw the switch to enable the I/O APIC's. */ if (mpfps->mpfb2 & MPFB2_IMCR_PRESENT) { outb(0x22, 0x70); /* select IMCR */ byte = inb(0x23); /* current contents */ byte |= 0x01; /* mask external INTR */ outb(0x23, byte); /* disconnect 8259s/NMI */ } return (0); } static void mptable_register(void *dummy __unused) { apic_register_enumerator(&mptable_enumerator); } SYSINIT(mptable_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, mptable_register, NULL); /* * Call the handler routine for each entry in the MP config base table. */ static void mptable_walk_table(mptable_entry_handler *handler, void *arg) { u_int i; u_char *entry; entry = (u_char *)(mpct + 1); for (i = 0; i < mpct->entry_count; i++) { switch (*entry) { case MPCT_ENTRY_PROCESSOR: case MPCT_ENTRY_IOAPIC: case MPCT_ENTRY_BUS: case MPCT_ENTRY_INT: case MPCT_ENTRY_LOCAL_INT: break; default: panic("%s: Unknown MP Config Entry %d\n", __func__, (int)*entry); } handler(entry, arg); entry += basetable_entry_types[*entry].length; } } #ifdef NEW_PCIB /* * Call the handler routine for each entry in the MP config extended * table. */ static void mptable_walk_extended_table(mptable_extended_entry_handler *handler, void *arg) { ext_entry_ptr end, entry; if (mpet == NULL) return; entry = mpet; end = (ext_entry_ptr)((char *)mpet + mpct->extended_table_length); while (entry < end) { handler(entry, arg); entry = (ext_entry_ptr)((char *)entry + entry->length); } } #endif static void mptable_probe_cpus_handler(u_char *entry, void *arg) { proc_entry_ptr proc; switch (*entry) { case MPCT_ENTRY_PROCESSOR: proc = (proc_entry_ptr)entry; if (proc->cpu_flags & PROCENTRY_FLAG_EN && proc->apic_id < MAX_LAPIC_ID && mp_ncpus < MAXCPU) { #ifdef SMP mp_ncpus++; mp_maxid = mp_ncpus - 1; #endif max_apic_id = max(max_apic_id, proc->apic_id); } break; } } static void mptable_setup_cpus_handler(u_char *entry, void *arg) { proc_entry_ptr proc; u_int *cpu_mask; switch (*entry) { case MPCT_ENTRY_PROCESSOR: proc = (proc_entry_ptr)entry; if (proc->cpu_flags & PROCENTRY_FLAG_EN) { lapic_create(proc->apic_id, proc->cpu_flags & PROCENTRY_FLAG_BP); if (proc->apic_id < MAX_LAPIC_ID) { cpu_mask = (u_int *)arg; *cpu_mask |= (1ul << proc->apic_id); } } break; } } static void mptable_count_items_handler(u_char *entry, void *arg __unused) { io_apic_entry_ptr apic; bus_entry_ptr bus; switch (*entry) { case MPCT_ENTRY_BUS: bus = (bus_entry_ptr)entry; mptable_nbusses++; if (bus->bus_id > mptable_maxbusid) mptable_maxbusid = bus->bus_id; break; case MPCT_ENTRY_IOAPIC: apic = (io_apic_entry_ptr)entry; if (apic->apic_flags & IOAPICENTRY_FLAG_EN) mptable_nioapics++; break; } } /* * Count items in the table. */ static void mptable_count_items(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { mptable_nioapics = 1; switch (mpfps->config_type) { case 1: case 2: case 3: case 4: mptable_nbusses = 1; break; case 5: case 6: case 7: mptable_nbusses = 2; break; default: panic("Unknown pre-defined MP Table config type %d", mpfps->config_type); } mptable_maxbusid = mptable_nbusses - 1; } else mptable_walk_table(mptable_count_items_handler, NULL); } /* * Add a bus or I/O APIC from an entry in the table. 
*/ static void mptable_parse_apics_and_busses_handler(u_char *entry, void *arg __unused) { io_apic_entry_ptr apic; bus_entry_ptr bus; enum busTypes bus_type; int i; switch (*entry) { case MPCT_ENTRY_BUS: bus = (bus_entry_ptr)entry; bus_type = lookup_bus_type(bus->bus_type); if (bus_type == UNKNOWN_BUSTYPE) { printf("MPTable: Unknown bus %d type \"", bus->bus_id); for (i = 0; i < 6; i++) printf("%c", bus->bus_type[i]); printf("\"\n"); } busses[bus->bus_id].bus_id = bus->bus_id; busses[bus->bus_id].bus_type = bus_type; break; case MPCT_ENTRY_IOAPIC: apic = (io_apic_entry_ptr)entry; if (!(apic->apic_flags & IOAPICENTRY_FLAG_EN)) break; if (apic->apic_id > IOAPIC_MAX_ID) panic("%s: I/O APIC ID %d too high", __func__, apic->apic_id); if (ioapics[apic->apic_id] != NULL) panic("%s: Double APIC ID %d", __func__, apic->apic_id); ioapics[apic->apic_id] = ioapic_create(apic->apic_address, apic->apic_id, -1); break; default: break; } } /* * Enumerate I/O APIC's and buses. */ static void mptable_parse_apics_and_busses(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { ioapics[2] = ioapic_create(DEFAULT_IO_APIC_BASE, 2, 0); busses[0].bus_id = 0; busses[0].bus_type = default_data[mpfps->config_type - 1][2]; if (mptable_nbusses > 1) { busses[1].bus_id = 1; busses[1].bus_type = default_data[mpfps->config_type - 1][4]; } } else mptable_walk_table(mptable_parse_apics_and_busses_handler, NULL); } /* * Determine conforming polarity for a given bus type. */ static enum intr_polarity conforming_polarity(u_char src_bus, u_char src_bus_irq) { KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); switch (busses[src_bus].bus_type) { case ISA: case EISA: return (INTR_POLARITY_HIGH); case PCI: return (INTR_POLARITY_LOW); default: panic("%s: unknown bus type %d", __func__, busses[src_bus].bus_type); } } /* * Determine conforming trigger for a given bus type. */ static enum intr_trigger conforming_trigger(u_char src_bus, u_char src_bus_irq) { KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); switch (busses[src_bus].bus_type) { case ISA: if (elcr_found) return (elcr_read_trigger(src_bus_irq)); else return (INTR_TRIGGER_EDGE); case PCI: return (INTR_TRIGGER_LEVEL); case EISA: KASSERT(src_bus_irq < 16, ("Invalid EISA IRQ %d", src_bus_irq)); KASSERT(elcr_found, ("Missing ELCR")); return (elcr_read_trigger(src_bus_irq)); default: panic("%s: unknown bus type %d", __func__, busses[src_bus].bus_type); } } static enum intr_polarity intentry_polarity(int_entry_ptr intr) { switch (intr->int_flags & INTENTRY_FLAGS_POLARITY) { case INTENTRY_FLAGS_POLARITY_CONFORM: return (conforming_polarity(intr->src_bus_id, intr->src_bus_irq)); case INTENTRY_FLAGS_POLARITY_ACTIVEHI: return (INTR_POLARITY_HIGH); case INTENTRY_FLAGS_POLARITY_ACTIVELO: return (INTR_POLARITY_LOW); default: panic("Bogus interrupt flags"); } } static enum intr_trigger intentry_trigger(int_entry_ptr intr) { switch (intr->int_flags & INTENTRY_FLAGS_TRIGGER) { case INTENTRY_FLAGS_TRIGGER_CONFORM: return (conforming_trigger(intr->src_bus_id, intr->src_bus_irq)); case INTENTRY_FLAGS_TRIGGER_EDGE: return (INTR_TRIGGER_EDGE); case INTENTRY_FLAGS_TRIGGER_LEVEL: return (INTR_TRIGGER_LEVEL); default: panic("Bogus interrupt flags"); } } /* * Parse an interrupt entry for an I/O interrupt routed to a pin on an I/O APIC. 
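 *
 * The "conforming" defaults implemented by conforming_polarity() and
 * conforming_trigger() above boil down to a small per-bus table; EISA
 * is left out here because its trigger mode is read from the ELCR at
 * run time.  A compact sketch:
 */
#include <stdio.h>

enum polarity { POL_HIGH, POL_LOW };
enum trigger { TRIG_EDGE, TRIG_LEVEL };

static const struct {
	const char	*bus;
	enum polarity	 pol;
	enum trigger	 trig;
} defaults[] = {
	{ "ISA", POL_HIGH, TRIG_EDGE },	/* edge unless the ELCR says otherwise */
	{ "PCI", POL_LOW, TRIG_LEVEL },
};

int
main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(defaults) / sizeof(defaults[0]); i++)
		printf("%s: active-%s, %s-triggered\n", defaults[i].bus,
		    defaults[i].pol == POL_HIGH ? "high" : "low",
		    defaults[i].trig == TRIG_EDGE ? "edge" : "level");
	return (0);
}

/*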
*/ static void mptable_parse_io_int(int_entry_ptr intr) { void *ioapic; u_int pin, apic_id; apic_id = intr->dst_apic_id; if (intr->dst_apic_id == 0xff) { /* * An APIC ID of 0xff means that the interrupt is connected * to the specified pin on all I/O APICs in the system. If * there is only one I/O APIC, then use that APIC to route * the interrupts. If there is more than one I/O APIC, then * punt. */ if (mptable_nioapics == 1) { apic_id = 0; while (ioapics[apic_id] == NULL) apic_id++; } else { printf( "MPTable: Ignoring global interrupt entry for pin %d\n", intr->dst_apic_int); return; } } if (apic_id > IOAPIC_MAX_ID) { printf("MPTable: Ignoring interrupt entry for ioapic%d\n", intr->dst_apic_id); return; } ioapic = ioapics[apic_id]; if (ioapic == NULL) { printf( "MPTable: Ignoring interrupt entry for missing ioapic%d\n", apic_id); return; } pin = intr->dst_apic_int; switch (intr->int_type) { case INTENTRY_TYPE_INT: switch (busses[intr->src_bus_id].bus_type) { case NOBUS: panic("interrupt from missing bus"); case ISA: case EISA: if (busses[intr->src_bus_id].bus_type == ISA) ioapic_set_bus(ioapic, pin, APIC_BUS_ISA); else ioapic_set_bus(ioapic, pin, APIC_BUS_EISA); if (intr->src_bus_irq == pin) break; ioapic_remap_vector(ioapic, pin, intr->src_bus_irq); if (ioapic_get_vector(ioapic, intr->src_bus_irq) == intr->src_bus_irq) ioapic_disable_pin(ioapic, intr->src_bus_irq); break; case PCI: ioapic_set_bus(ioapic, pin, APIC_BUS_PCI); break; default: ioapic_set_bus(ioapic, pin, APIC_BUS_UNKNOWN); break; } break; case INTENTRY_TYPE_NMI: ioapic_set_nmi(ioapic, pin); break; case INTENTRY_TYPE_SMI: ioapic_set_smi(ioapic, pin); break; case INTENTRY_TYPE_EXTINT: ioapic_set_extint(ioapic, pin); break; default: panic("%s: invalid interrupt entry type %d\n", __func__, intr->int_type); } if (intr->int_type == INTENTRY_TYPE_INT || (intr->int_flags & INTENTRY_FLAGS_TRIGGER) != INTENTRY_FLAGS_TRIGGER_CONFORM) ioapic_set_triggermode(ioapic, pin, intentry_trigger(intr)); if (intr->int_type == INTENTRY_TYPE_INT || (intr->int_flags & INTENTRY_FLAGS_POLARITY) != INTENTRY_FLAGS_POLARITY_CONFORM) ioapic_set_polarity(ioapic, pin, intentry_polarity(intr)); } /* * Parse an interrupt entry for a local APIC LVT pin. */ static void mptable_parse_local_int(int_entry_ptr intr) { u_int apic_id, pin; if (intr->dst_apic_id == 0xff) apic_id = APIC_ID_ALL; else apic_id = intr->dst_apic_id; if (intr->dst_apic_int == 0) pin = APIC_LVT_LINT0; else pin = APIC_LVT_LINT1; switch (intr->int_type) { case INTENTRY_TYPE_INT: #if 1 printf( "MPTable: Ignoring vectored local interrupt for LINTIN%d vector %d\n", intr->dst_apic_int, intr->src_bus_irq); return; #else lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_FIXED); break; #endif case INTENTRY_TYPE_NMI: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_NMI); break; case INTENTRY_TYPE_SMI: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_SMI); break; case INTENTRY_TYPE_EXTINT: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_EXTINT); break; default: panic("%s: invalid interrupt entry type %d\n", __func__, intr->int_type); } if ((intr->int_flags & INTENTRY_FLAGS_TRIGGER) != INTENTRY_FLAGS_TRIGGER_CONFORM) lapic_set_lvt_triggermode(apic_id, pin, intentry_trigger(intr)); if ((intr->int_flags & INTENTRY_FLAGS_POLARITY) != INTENTRY_FLAGS_POLARITY_CONFORM) lapic_set_lvt_polarity(apic_id, pin, intentry_polarity(intr)); } /* * Parse interrupt entries. 
*/ static void mptable_parse_ints_handler(u_char *entry, void *arg __unused) { int_entry_ptr intr; intr = (int_entry_ptr)entry; switch (*entry) { case MPCT_ENTRY_INT: mptable_parse_io_int(intr); break; case MPCT_ENTRY_LOCAL_INT: mptable_parse_local_int(intr); break; } } /* * Configure interrupt pins for a default configuration. For details see * Table 5-2 in Section 5 of the MP Table specification. */ static void mptable_parse_default_config_ints(void) { struct INTENTRY entry; int pin; /* * All default configs route IRQs from bus 0 to the first 16 pins * of the first I/O APIC with an APIC ID of 2. */ entry.type = MPCT_ENTRY_INT; entry.int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | INTENTRY_FLAGS_TRIGGER_CONFORM; entry.src_bus_id = 0; entry.dst_apic_id = 2; /* Run through all 16 pins. */ for (pin = 0; pin < 16; pin++) { entry.dst_apic_int = pin; switch (pin) { case 0: /* Pin 0 is an ExtINT pin. */ entry.int_type = INTENTRY_TYPE_EXTINT; break; case 2: /* IRQ 0 is routed to pin 2. */ entry.int_type = INTENTRY_TYPE_INT; entry.src_bus_irq = 0; break; default: /* All other pins are identity mapped. */ entry.int_type = INTENTRY_TYPE_INT; entry.src_bus_irq = pin; break; } mptable_parse_io_int(&entry); } /* Certain configs disable certain pins. */ if (mpfps->config_type == 7) ioapic_disable_pin(ioapics[2], 0); if (mpfps->config_type == 2) { ioapic_disable_pin(ioapics[2], 2); ioapic_disable_pin(ioapics[2], 13); } } /* * Configure the interrupt pins */ static void mptable_parse_ints(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { /* Configure LINT pins. */ lapic_set_lvt_mode(APIC_ID_ALL, APIC_LVT_LINT0, APIC_LVT_DM_EXTINT); lapic_set_lvt_mode(APIC_ID_ALL, APIC_LVT_LINT1, APIC_LVT_DM_NMI); /* Configure I/O APIC pins. */ mptable_parse_default_config_ints(); } else mptable_walk_table(mptable_parse_ints_handler, NULL); } #ifdef MPTABLE_FORCE_HTT /* * Perform a hyperthreading "fix-up" to enumerate any logical CPU's * that aren't already listed in the table. * * XXX: We assume that all of the physical CPUs in the * system have the same number of logical CPUs. * * XXX: We assume that APIC ID's are allocated such that * the APIC ID's for a physical processor are aligned * with the number of logical CPU's in the processor. */ static void mptable_hyperthread_fixup(u_int id_mask) { u_int i, id, logical_cpus; /* Nothing to do if there is no HTT support. */ if ((cpu_feature & CPUID_HTT) == 0) return; logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; if (logical_cpus <= 1) return; /* * For each APIC ID of a CPU that is set in the mask, * scan the other candidate APIC ID's for this * physical processor. If any of those ID's are * already in the table, then kill the fixup. */ for (id = 0; id <= MAX_LAPIC_ID; id++) { if ((id_mask & 1 << id) == 0) continue; /* First, make sure we are on a logical_cpus boundary. */ if (id % logical_cpus != 0) return; for (i = id + 1; i < id + logical_cpus; i++) if ((id_mask & 1 << i) != 0) return; } /* * Ok, the ID's checked out, so perform the fixup by * adding the logical CPUs. */ while ((id = ffs(id_mask)) != 0) { id--; for (i = id + 1; i < id + logical_cpus; i++) { if (bootverbose) printf( "MPTable: Adding logical CPU %d from main CPU %d\n", i, id); lapic_create(i, 0); } id_mask &= ~(1 << id); } } #endif /* MPTABLE_FORCE_HTT */ /* * Support code for routing PCI interrupts using the MP Table. */ static void mptable_pci_setup(void) { int i; /* * Find the first pci bus and call it 0. Panic if pci0 is not * bus zero and there are multiple PCI buses. 
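	 *
	 * A standalone walk-through of the sanity checks in
	 * mptable_hyperthread_fixup() above, with hypothetical CPUID
	 * data: the logical-CPU count comes from CPUID leaf 1 EBX bits
	 * 23:16, and the fixup runs only if every listed APIC ID sits
	 * on a logical_cpus boundary with no sibling IDs already
	 * present.
	 */
#include <stdio.h>
#include <strings.h>

#define	MAX_LAPIC_ID	31

int
main(void)
{
	unsigned int cpu_procinfo = 0x00020000;	/* hypothetical: 2 logical */
	unsigned int logical_cpus = (cpu_procinfo >> 16) & 0xff;
	unsigned int id_mask = 0x5;	/* table lists APIC IDs 0 and 2 */
	unsigned int i, id;

	for (id = 0; id <= MAX_LAPIC_ID; id++) {
		if ((id_mask & 1u << id) == 0)
			continue;
		if (id % logical_cpus != 0)
			return (1);	/* ID not aligned: no fixup */
		for (i = id + 1; i < id + logical_cpus; i++)
			if ((id_mask & 1u << i) != 0)
				return (1);	/* sibling listed: no fixup */
	}
	while ((id = ffs(id_mask)) != 0) {
		id--;
		for (i = id + 1; i < id + logical_cpus; i++)
			printf("would add logical CPU %u for CPU %u\n",
			    i, id);
		id_mask &= ~(1u << id);
	}
	return (0);
}

	/*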
*/ for (i = 0; i <= mptable_maxbusid; i++) if (busses[i].bus_type == PCI) { if (pci0 == -1) pci0 = i; else if (pci0 != 0) panic( "MPTable contains multiple PCI buses but no PCI bus 0"); } } static void mptable_pci_probe_table_handler(u_char *entry, void *arg) { struct pci_probe_table_args *args; int_entry_ptr intr; if (*entry != MPCT_ENTRY_INT) return; intr = (int_entry_ptr)entry; args = (struct pci_probe_table_args *)arg; KASSERT(args->bus <= mptable_maxbusid, ("bus %d is too big", args->bus)); KASSERT(busses[args->bus].bus_type == PCI, ("probing for non-PCI bus")); if (intr->src_bus_id == args->bus) args->found = 1; } int mptable_pci_probe_table(int bus) { struct pci_probe_table_args args; if (bus < 0) return (EINVAL); if (mpct == NULL || pci0 == -1 || pci0 + bus > mptable_maxbusid) return (ENXIO); if (busses[pci0 + bus].bus_type != PCI) return (ENXIO); args.bus = pci0 + bus; args.found = 0; mptable_walk_table(mptable_pci_probe_table_handler, &args); if (args.found == 0) return (ENXIO); return (0); } static void mptable_pci_route_interrupt_handler(u_char *entry, void *arg) { struct pci_route_interrupt_args *args; int_entry_ptr intr; int vector; if (*entry != MPCT_ENTRY_INT) return; intr = (int_entry_ptr)entry; args = (struct pci_route_interrupt_args *)arg; if (intr->src_bus_id != args->bus || intr->src_bus_irq != args->irq) return; /* Make sure the APIC maps to a known APIC. */ KASSERT(ioapics[intr->dst_apic_id] != NULL, ("No I/O APIC %d to route interrupt to", intr->dst_apic_id)); /* * Look up the vector for this APIC / pin combination. If we * have previously matched an entry for this PCI IRQ but it * has the same vector as this entry, just return. Otherwise, * we use the vector for this APIC / pin combination. */ vector = ioapic_get_vector(ioapics[intr->dst_apic_id], intr->dst_apic_int); if (args->vector == vector) return; KASSERT(args->vector == -1, ("Multiple IRQs for PCI interrupt %d.%d.INT%c: %d and %d\n", args->bus, args->irq >> 2, 'A' + (args->irq & 0x3), args->vector, vector)); args->vector = vector; } int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) { struct pci_route_interrupt_args args; int slot; /* Like ACPI, pin numbers are 0-3, not 1-4. */ pin--; KASSERT(pci0 != -1, ("do not know how to route PCI interrupts")); args.bus = pci_get_bus(dev) + pci0; slot = pci_get_slot(dev); /* * PCI interrupt entries in the MP Table encode both the slot and * pin into the IRQ with the pin being the two least significant * bits, the slot being the next five bits, and the most significant * bit being reserved. */ args.irq = slot << 2 | pin; args.vector = -1; mptable_walk_table(mptable_pci_route_interrupt_handler, &args); if (args.vector < 0) { device_printf(pcib, "unable to route slot %d INT%c\n", slot, 'A' + pin); return (PCI_INVALID_IRQ); } if (bootverbose) device_printf(pcib, "slot %d INT%c routed to irq %d\n", slot, 'A' + pin, args.vector); return (args.vector); } #ifdef NEW_PCIB struct host_res_args { struct mptable_hostb_softc *sc; device_t dev; u_char bus; }; /* * Initialize a Host-PCI bridge so it can restrict resource allocation * requests to the resources it actually decodes according to MP * config table extended entries. 
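 *
 * A round-trip sketch of the slot:pin IRQ encoding described above for
 * MP Table PCI interrupt entries: the pin occupies the two least
 * significant bits and the slot the next five.  The slot and pin
 * values are arbitrary examples.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int slot = 13, pin = 2;	/* 0-based pin: INTC */
	unsigned int irq, dslot, dpin;

	irq = slot << 2 | pin;		/* encode, as in args.irq above */
	dslot = irq >> 2;		/* decode, as in the KASSERT above */
	dpin = irq & 0x3;
	printf("slot %u INT%c -> irq %#x -> slot %u INT%c\n",
	    slot, 'A' + pin, irq, dslot, 'A' + dpin);
	return (0);
}

/*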
*/ static void mptable_host_res_handler(ext_entry_ptr entry, void *arg) { struct host_res_args *args; cbasm_entry_ptr cbasm; sas_entry_ptr sas; const char *name; uint64_t start, end; int error, *flagp, flags, type; args = arg; switch (entry->type) { case MPCT_EXTENTRY_SAS: sas = (sas_entry_ptr)entry; if (sas->bus_id != args->bus) break; switch (sas->address_type) { case SASENTRY_TYPE_IO: type = SYS_RES_IOPORT; flags = 0; break; case SASENTRY_TYPE_MEMORY: type = SYS_RES_MEMORY; flags = 0; break; case SASENTRY_TYPE_PREFETCH: type = SYS_RES_MEMORY; flags = RF_PREFETCHABLE; break; default: printf( "MPTable: Unknown systems address space type for bus %u: %d\n", sas->bus_id, sas->address_type); return; } start = sas->address_base; end = sas->address_base + sas->address_length - 1; #ifdef __i386__ if (start > ULONG_MAX) { device_printf(args->dev, "Ignoring %d range above 4GB (%#jx-%#jx)\n", type, (uintmax_t)start, (uintmax_t)end); break; } if (end > ULONG_MAX) { device_printf(args->dev, "Truncating end of %d range above 4GB (%#jx-%#jx)\n", type, (uintmax_t)start, (uintmax_t)end); end = ULONG_MAX; } #endif error = pcib_host_res_decodes(&args->sc->sc_host_res, type, start, end, flags); if (error) panic("Failed to manage %d range (%#jx-%#jx): %d", type, (uintmax_t)start, (uintmax_t)end, error); break; case MPCT_EXTENTRY_CBASM: cbasm = (cbasm_entry_ptr)entry; if (cbasm->bus_id != args->bus) break; switch (cbasm->predefined_range) { case CBASMENTRY_RANGE_ISA_IO: flagp = &args->sc->sc_decodes_isa_io; name = "ISA I/O"; break; case CBASMENTRY_RANGE_VGA_IO: flagp = &args->sc->sc_decodes_vga_io; name = "VGA I/O"; break; default: printf( "MPTable: Unknown compatiblity address space range for bus %u: %d\n", cbasm->bus_id, cbasm->predefined_range); return; } if (*flagp != 0) printf( "MPTable: Duplicate compatibility %s range for bus %u\n", name, cbasm->bus_id); switch (cbasm->address_mod) { case CBASMENTRY_ADDRESS_MOD_ADD: *flagp = 1; if (bootverbose) device_printf(args->dev, "decoding %s ports\n", name); break; case CBASMENTRY_ADDRESS_MOD_SUBTRACT: *flagp = -1; if (bootverbose) device_printf(args->dev, "not decoding %s ports\n", name); break; default: printf( "MPTable: Unknown compatibility address space modifier: %u\n", cbasm->address_mod); break; } break; } } void mptable_pci_host_res_init(device_t pcib) { struct host_res_args args; KASSERT(pci0 != -1, ("do not know how to map PCI bus IDs")); args.bus = pci_get_bus(pcib) + pci0; args.dev = pcib; args.sc = device_get_softc(pcib); if (pcib_host_res_init(pcib, &args.sc->sc_host_res) != 0) panic("failed to init hostb resources"); mptable_walk_extended_table(mptable_host_res_handler, &args); } #endif
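/*
 * A sketch of the 4 GB clamping the handler above applies on i386,
 * where resource addresses are 32-bit u_longs: ranges that start above
 * 4 GB are dropped and ranges that merely end above it are truncated.
 * The example range is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t start = 0xc0000000, end = 0x17fffffff; /* crosses 4 GB */
	const uint64_t max32 = 0xffffffff;	/* ULONG_MAX on i386 */

	if (start > max32) {
		printf("range ignored\n");
		return (0);
	}
	if (end > max32)
		end = max32;
	printf("decoding %#jx-%#jx\n", (uintmax_t)start, (uintmax_t)end);
	return (0);
}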