diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 33defe79c8b9..e954e8cebbb9 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -1,1102 +1,1101 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/domainset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cputypes.h>
 #include <machine/cpufunc.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <machine/cpu.h>
 #include <x86/init.h>
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 /*
  * Warm-start vector, reached through KERNBASE.  start_all_aps() saves it,
  * stores WARMBOOT_TARGET in the word at WARMBOOT_OFF and the trampoline
  * address shifted right by 4 in the word at WARMBOOT_SEG, and restores the
  * saved value once the APs are running (all skipped when efi_boot is set).
  */
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
 
 /*
  * CMOS access used by start_all_aps(): CMOS_REG is written with the
  * register index, CMOS_DATA is then read or written.  BIOS_RESET is the
  * register index used and BIOS_WARM the 'warm-start' value stored there.
  */
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 /*
  * v * 2^30 as an unsigned long long.  The ULL suffix is token-pasted onto
  * the argument, so v has to be a plain integer literal.
  */
 #define GiB(v)			(v ## ULL << 30)
 
 /* Four pages.  NOTE(review): not referenced in the visible code. */
 #define	AP_BOOTPT_SZ		(PAGE_SIZE * 4)
 
 /*
  * Temporary variables for init_secondary().  start_all_aps() fills them in
  * for the AP it is about to start; init_secondary() consumes them on that
  * AP.
  */
 static char *doublefault_stack;
 static char *mce_stack;
 static char *nmi_stack;
 static char *dbg_stack;
 void *bootpcpu;
 
 /*
  * Set by cpu_mp_start() from la57 and pg_nx before the APs are started;
  * presumably read by the AP trampoline code (defined elsewhere).
  */
 extern u_int mptramp_la57;
 extern u_int mptramp_nx;
 
 /*
  * Local data and functions.
  */
 
 /* Sends the startup IPI to one AP and waits for it to announce itself. */
 static int start_ap(int apic_id, vm_paddr_t boot_address);
 
 /*
  * Boot-processor side of SMP bring-up: reset the logical ID to APIC ID
  * table, install the IPI handlers, establish boot_cpu_id, probe the CPU
  * topology, publish the paging configuration for the AP trampoline and
  * start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int slot;
 
 	/* No logical CPU ID has an APIC ID assigned yet. */
 	for (slot = 0; slot < MAXCPU; slot++)
 		cpu_apic_ids[slot] = -1;
 
 	/*
 	 * IPI vectors.  Every vector uses the _pti variant of its entry
 	 * point when pti is set.
 	 */
 
 	/* Cache and TLB invalidations. */
 	setidt(IPI_INVLOP,
 	    pti ? IDTVEC(invlop_pti) : IDTVEC(invlop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* All-CPU rendezvous. */
 	setidt(IPI_RENDEZVOUS,
 	    pti ? IDTVEC(rendezvous_pti) : IDTVEC(rendezvous),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Generic (bitmap) inter-CPU IPI handler. */
 	setidt(IPI_BITMAP_VECTOR,
 	    pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
 	    IDTVEC(ipi_intr_bitmap_handler),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* CPU stop/restart. */
 	setidt(IPI_STOP,
 	    pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* CPU suspend/resume. */
 	setidt(IPI_SUSPEND,
 	    pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Delayed SWI. */
 	setidt(IPI_SWI,
 	    pti ? IDTVEC(ipi_swi_pti) : IDTVEC(ipi_swi),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/*
 	 * Either cross-check an already known boot_cpu_id against this
 	 * CPU's APIC ID, or record this CPU as the BSP.
 	 */
 	if (boot_cpu_id != -1) {
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 	} else {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	}
 
 	/* Logical/physical core configuration, then CPU ID assignment. */
 	topo_probe();
 	assign_cpu_ids();
 
 	/* Paging parameters consumed while the APs boot. */
 	mptramp_la57 = la57;
 	mptramp_nx = (pg_nx != 0);
 	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
 	mptramp_pagetables = kernel_pmap->pm_cr3;
 
 	/* Bring up every Application Processor. */
 	start_all_aps();
 
 	set_interrupt_apic_ids();
 
 #if defined(DEV_ACPI) && MAXMEMDOM > 1
 	acpi_pxm_set_cpu_locality();
 #endif
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  *
  * Runs on the AP being started.  It relies on bootAP, bootpcpu, dpcpu and
  * the four exception stack pointers that start_all_aps() set up for this
  * AP.  The function initializes the AP's pcpu area, TSS, GDT, IDT and
  * segment base MSRs, signals the BSP by incrementing mp_naps, spins until
  * aps_ready becomes non-zero and then continues in init_secondary_tail().
  */
 void
 init_secondary(void)
 {
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor ap_gdt;
 	u_int64_t cr0;
 	int cpu, gsel_tss, x;
 
 	/* Set by the startup code for us to use */
 	cpu = bootAP;
 
 	/* Update microcode before doing anything else. */
 	ucode_load_ap(cpu);
 
 	/* Initialize the PCPU area. */
 	pc = bootpcpu;
 	pcpu_init(pc, cpu, sizeof(struct pcpu));
 	dpcpu_init(dpcpu, cpu);
 	pc->pc_apic_id = cpu_apic_ids[cpu];
 	pc->pc_prvspace = pc;
 	pc->pc_curthread = 0;
 	pc->pc_tssp = &pc->pc_common_tss;
 	pc->pc_rsp0 = 0;
 	/* End of pc_pti_stack, rounded down to a 16-byte boundary. */
 	pc->pc_pti_rsp0 = (((vm_offset_t)&pc->pc_pti_stack +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
 	/* The GDT lives inside the pcpu; cache pointers to selected slots. */
 	gdt = pc->pc_gdt;
 	pc->pc_tss = (struct system_segment_descriptor *)&gdt[GPROC0_SEL];
 	pc->pc_fs32p = &gdt[GUFS32_SEL];
 	pc->pc_gs32p = &gdt[GUGS32_SEL];
 	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL];
 	pc->pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
 	/* See comment in pmap_bootstrap(). */
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
 
 	/*
 	 * Start the shootdown generation at a non-zero value; zero in the
 	 * invalidation scoreboard marks a pending request.
 	 */
 	pc->pc_smp_tlb_gen = 1;
 
 	/* Init tss */
 	/* Start from CPU 0's TSS and patch the per-CPU fields below. */
 	pc->pc_common_tss = __pcpu[0].pc_common_tss;
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	pc->pc_common_tss.tss_rsp0 = 0;
 
 	/*
 	 * For each IST stack, a struct nmi_pcpu is placed at the very top
 	 * of the stack and loaded with this CPU's pcpu pointer; the IST
 	 * entry is set to the address of that structure.
 	 */
 
 	/* The doublefault stack runs on IST1. */
 	np = ((struct nmi_pcpu *)&doublefault_stack[DBLFAULT_STACK_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist1 = (long)np;
 
 	/* The NMI stack runs on IST2. */
 	np = ((struct nmi_pcpu *)&nmi_stack[NMI_STACK_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist2 = (long)np;
 
 	/* The MC# stack runs on IST3. */
 	np = ((struct nmi_pcpu *)&mce_stack[MCE_STACK_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist3 = (long)np;
 
 	/* The DB# stack runs on IST4. */
 	np = ((struct nmi_pcpu *)&dbg_stack[DBG_STACK_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist4 = (long)np;
 
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long)&pc->pc_common_tss;
 	/*
 	 * The TSS and user LDT slots (and the slot following each) are not
 	 * converted with ssdtosd(); the TSS descriptor is written with
 	 * ssdtosyssd() right after the loop.
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != GPROC0_SEL + 1 &&
 		    x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	ap_gdt.rd_base = (u_long)gdt;
 	lgdt(&ap_gdt);			/* does magic intra-segment return */
 
 	/* Kernel GS base is the pcpu pointer; user FS/GS bases start at 0. */
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (uint64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value */
 	fix_cpuid();
 
 	lidt(&r_idt);
 
 	/* Load the task register with this CPU's TSS selector. */
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 
 	amd64_conf_fast_syscall();
 
 	/* signal our startup to the BSP. */
 	/* start_ap() on the BSP polls mp_naps for this increment. */
 	mp_naps++;
 
 	/* Spin until the BSP releases the AP's. */
 	while (atomic_load_acq_int(&aps_ready) == 0)
 		ia32_pause();
 
 	init_secondary_tail();
 }
 
 /*******************************************************************
  * local functions and data
  */
 
 #ifdef NUMA
 /*
  * Move the pcpu page of CPU 'cpuid' into NUMA domain 'domain' unless it
  * is already backed by memory from that domain.  A page is allocated from
  * 'domain', the current pcpu contents are copied into it and the pcpu
  * virtual address is remapped onto the new page.  If no page can be
  * allocated the pcpu area is left where it is.
  */
 static void
 mp_realloc_pcpu(int cpuid, int domain)
 {
 	vm_offset_t new_va, pcpu_va;
 	vm_page_t page;
 
 	pcpu_va = (vm_offset_t)&__pcpu[cpuid];
 
 	/* Already local to the requested domain: nothing to do. */
 	if (vm_phys_domain(pmap_kextract(pcpu_va)) == domain)
 		return;
 
 	page = vm_page_alloc_noobj_domain(domain, 0);
 	if (page != NULL) {
 		/* Copy through the direct map, then switch the mapping. */
 		new_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
 		pagecopy((void *)pcpu_va, (void *)new_va);
 		pmap_qenter(pcpu_va, &page, 1);
 		/* XXX old pcpu page leaked. */
 	}
 }
 #endif
 
 /*
  * start each AP in our list
  *
  * Copies the boot trampoline into a page below 1M, builds a transient 1:1
  * mapping of the low 4G, points the warm-start vector and the CMOS reset
  * register at the trampoline (the vector only when !efi_boot), and then,
  * for every CPU from 1 to mp_ncpus - 1, allocates its stacks and dynamic
  * pcpu area and starts it with start_ap().  Panics if an AP does not
  * start.  Afterwards the warm-start vector, the CMOS value and page table
  * slot 0 are restored and the temporary pages are freed.
  *
  * Returns mp_naps, the number of APs that have signalled startup.
  */
 int
 start_all_aps(void)
 {
 	vm_page_t m_boottramp, m_pml4, m_pdp, m_pd[4];
 	pml5_entry_t old_pml45;
 	pml4_entry_t *v_pml4;
 	pdp_entry_t *v_pdp;
 	pd_entry_t *v_pd;
 	vm_paddr_t boot_address;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	MPASS(bootMP_size <= PAGE_SIZE);
-	m_boottramp = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
-	    VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ, 1, 0,
+	m_boottramp = vm_page_alloc_noobj_contig(0, 1, 0,
 	    (1ULL << 20), /* Trampoline should be below 1M for real mode */
 	    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 	/*
 	 * NOTE(review): the allocation result is used without a NULL
 	 * check -- confirm it cannot fail at this point of the boot.
 	 */
 	boot_address = VM_PAGE_TO_PHYS(m_boottramp);
 
 	/* Create a transient 1:1 mapping of low 4G */
 	/*
 	 * With la57 a fresh PML4 page is allocated and later hooked into
 	 * slot 0 of the top-level table; otherwise the top-level table of
 	 * the kernel pmap is itself the PML4 and its slot 0 is used.
 	 */
 	if (la57) {
 		m_pml4 = pmap_page_alloc_below_4g(true);
 		v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
 	} else {
 		v_pml4 = &kernel_pmap->pm_pmltop[0];
 	}
 	m_pdp = pmap_page_alloc_below_4g(true);
 	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
 	/*
 	 * Four page directories filled with large-page (PG_PS) entries;
 	 * directory k maps physical addresses starting at k * NBPDP.
 	 */
 	m_pd[0] = pmap_page_alloc_below_4g(false);
 	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
 	for (i = 0; i < NPDEPG; i++)
 		v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
 		    X86_PG_M | PG_PS;
 	m_pd[1] = pmap_page_alloc_below_4g(false);
 	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
 	for (i = 0; i < NPDEPG; i++)
 		v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
 		    X86_PG_A | X86_PG_M | PG_PS;
 	m_pd[2] = pmap_page_alloc_below_4g(false);
 	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
 	for (i = 0; i < NPDEPG; i++)
 		v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
 		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
 	m_pd[3] = pmap_page_alloc_below_4g(false);
 	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
 	for (i = 0; i < NPDEPG; i++)
 		v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
 		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
 	/* Hook the four page directories into the PDP page. */
 	v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M;
 	v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M;
 	v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M;
 	v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M;
 	/* Remember top-level slot 0 so it can be restored at the end. */
 	old_pml45 = kernel_pmap->pm_pmltop[0];
 	if (la57) {
 		kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
 		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 	}
 	v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M;
 	pmap_invalidate_all(kernel_pmap);
 
 	/* copy the AP 1st level boot code */
 	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
 	if (bootverbose)
 		printf("AP boot address %#lx\n", boot_address);
 
 	/* save the current value of the warm-start vector */
 	if (!efi_boot)
 		mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 
 	/* setup a vector to our boot code */
 	if (!efi_boot) {
 		*((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
 		*((volatile u_short *)WARMBOOT_SEG) = (boot_address >> 4);
 	}
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
 	/* Relocate pcpu areas to the correct domain. */
 #ifdef NUMA
 	if (vm_ndomains > 1)
 		for (cpu = 1; cpu < mp_ncpus; cpu++) {
 			apic_id = cpu_apic_ids[cpu];
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 			mp_realloc_pcpu(cpu, domain);
 		}
 #endif
 
 	/* start each AP */
 	domain = 0;
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		apic_id = cpu_apic_ids[cpu];
 #ifdef NUMA
 		if (vm_ndomains > 1)
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 #endif
 		/* allocate and set up an idle stack data page */
 		/*
 		 * The file-scope stack pointers, dpcpu, bootpcpu, bootSTK
 		 * and bootAP are reused for every AP; init_secondary()
 		 * reads them on the AP started below.
 		 */
 		bootstacks[cpu] = (void *)kmem_malloc(kstack_pages * PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(DBLFAULT_STACK_SIZE,
 		    M_WAITOK | M_ZERO);
 		mce_stack = (char *)kmem_malloc(MCE_STACK_SIZE,
 		    M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), NMI_STACK_SIZE, M_WAITOK | M_ZERO);
 		dbg_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), DBG_STACK_SIZE, M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
 		    DPCPU_SIZE, M_WAITOK | M_ZERO);
 
 		bootpcpu = &__pcpu[cpu];
 		/* Initial stack pointer: 8 bytes below the top of the stack. */
 		bootSTK = (char *)bootstacks[cpu] +
 		    kstack_pages * PAGE_SIZE - 8;
 		bootAP = cpu;
 
 		/* attempt to start the Application Processor */
 		if (!start_ap(apic_id, boot_address)) {
 			/* restore the warmstart vector */
 			if (!efi_boot)
 				*(u_int32_t *)WARMBOOT_OFF = mpbioswarmvec;
 			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
 		}
 
 		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
 	}
 
 	/* restore the warmstart vector */
 	if (!efi_boot)
 		*(u_int32_t *)WARMBOOT_OFF = mpbioswarmvec;
 
 	/* Put back the CMOS value that was read before the APs were started. */
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
 	/* Destroy transient 1:1 mapping */
 	kernel_pmap->pm_pmltop[0] = old_pml45;
 	invlpg(0);
 	if (la57)
 		vm_page_free(m_pml4);
 	vm_page_free(m_pd[3]);
 	vm_page_free(m_pd[2]);
 	vm_page_free(m_pd[1]);
 	vm_page_free(m_pd[0]);
 	vm_page_free(m_pdp);
 	vm_page_free(m_boottramp);
 
 	/* number of APs actually started */
 	return (mp_naps);
 }
 
 /*
  * Start the AP (application processor) identified by the APIC ID
  * 'apic_id', using the trampoline located at physical address
  * 'boot_address'.
  *
  * The startup vector is the page number of the trampoline (bits 12-19 of
  * 'boot_address').  After ipi_startup() has been issued, mp_naps is polled
  * once per millisecond for up to 5 seconds; the AP increments it once it
  * is running.
  *
  * Returns 1 if the AP announced itself in time, 0 otherwise.
  */
 static int
 start_ap(int apic_id, vm_paddr_t boot_address)
 {
 	int naps_before, startup_vector, waited_ms;
 
 	startup_vector = (boot_address >> 12) & 0xff;
 
 	/* Baseline used to detect the AP's startup notification. */
 	naps_before = mp_naps;
 
 	ipi_startup(apic_id, startup_vector);
 
 	waited_ms = 0;
 	while (waited_ms < 5000) {
 		if (mp_naps > naps_before)
 			return (1);	/* SUCCESS */
 		DELAY(1000);
 		waited_ms++;
 	}
 
 	return (0);			/* FAILURE */
 }
 
 /*
  * Flush the TLB on other CPU's
  */
 
 /*
  * Invalidation request codes.  The pcpu field pc_smp_tlb_op stores them
  * as u_int rather than as this enum type, which avoids both namespace and
  * ABI issues (with enums).  Each code selects one handler in
  * invlop_handler_one_req().
  */
 enum invl_op_codes {
 	/* Whole-TLB invalidation. */
 	INVL_OP_TLB			= 1,
 	INVL_OP_TLB_INVPCID		= 2,
 	INVL_OP_TLB_INVPCID_PTI		= 3,
 	INVL_OP_TLB_PCID		= 4,
 	/* Page range invalidation. */
 	INVL_OP_PGRNG			= 5,
 	INVL_OP_PGRNG_INVPCID		= 6,
 	INVL_OP_PGRNG_PCID		= 7,
 	/* Single page invalidation. */
 	INVL_OP_PG			= 8,
 	INVL_OP_PG_INVPCID		= 9,
 	INVL_OP_PG_PCID			= 10,
 	/* Cache flush. */
 	INVL_OP_CACHE			= 11,
 };
 
 /*
  * Request codes to use for whole-TLB, page range and single page
  * invalidations on the current machine and environment.  They are
  * initialized once at startup by invl_scoreboard_init().
  */
 static enum invl_op_codes invl_op_tlb;
 static enum invl_op_codes invl_op_pgrng;
 static enum invl_op_codes invl_op_pg;
 
 /*
  * Scoreboard of IPI completion notifications from targets to the IPI
  * initiator.
  *
  * Each CPU can initiate a shootdown IPI independently of the other CPUs.
  * The initiator enters a critical section, fills in its local PCPU
  * shootdown info (the pc_smp_tlb_ variables), and then clears the
  * scoreboard generation at location (cpu, my_cpuid) for each target cpu.
  * After that the IPI is sent to all targets, which scan for zeroed
  * scoreboard generation words.  Upon finding such a word, the target reads
  * the shootdown data from the corresponding cpu's pcpu and sets the
  * generation.  Meanwhile, the initiator loops, waiting for all of the
  * generations it zeroed in the scoreboard to be updated.
  */
 static uint32_t *invl_scoreboard;
 
 /*
  * SYSINIT hook.  Allocates the (mp_maxid + 1) x (mp_maxid + 1) scoreboard
  * with every slot set to 1 (no request pending) and selects the
  * invalidation request codes according to pmap_pcid_enabled,
  * invpcid_works and pti.
  */
 static void
 invl_scoreboard_init(void *arg __unused)
 {
 	u_int idx, ncpus;
 
 	ncpus = mp_maxid + 1;
 	invl_scoreboard = malloc(sizeof(uint32_t) * ncpus * ncpus, M_DEVBUF,
 	    M_WAITOK);
 	for (idx = 0; idx < ncpus * ncpus; idx++)
 		invl_scoreboard[idx] = 1;
 
 	if (!pmap_pcid_enabled) {
 		/* No PCID in use. */
 		invl_op_tlb = INVL_OP_TLB;
 		invl_op_pgrng = INVL_OP_PGRNG;
 		invl_op_pg = INVL_OP_PG;
 	} else if (!invpcid_works) {
 		/* PCID in use, but without a working INVPCID. */
 		invl_op_tlb = INVL_OP_TLB_PCID;
 		invl_op_pgrng = INVL_OP_PGRNG_PCID;
 		invl_op_pg = INVL_OP_PG_PCID;
 	} else {
 		/* PCID with INVPCID; the full flush also depends on pti. */
 		invl_op_tlb = pti ? INVL_OP_TLB_INVPCID_PTI :
 		    INVL_OP_TLB_INVPCID;
 		invl_op_pgrng = INVL_OP_PGRNG_INVPCID;
 		invl_op_pg = INVL_OP_PG_INVPCID;
 	}
 }
 SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL);
 
 /*
  * Return the scoreboard row of 'cpu': the mp_maxid + 1 slots, one per
  * potential initiator, that 'cpu' scans as a shootdown target.
  */
 static uint32_t *
 invl_scoreboard_getcpu(u_int cpu)
 {
 
 	return (&invl_scoreboard[cpu * (mp_maxid + 1)]);
 }
 
 /*
  * Return the slot in the scoreboard row of target 'cpu' that belongs to
  * the calling CPU acting as initiator.
  */
 static uint32_t *
 invl_scoreboard_slot(u_int cpu)
 {
 	uint32_t *row;
 
 	row = invl_scoreboard_getcpu(cpu);
 	return (&row[PCPU_GET(cpuid)]);
 }
 
 /*
  * Used by the pmap to request cache or TLB invalidation on local and
  * remote processors.  Mask provides the set of remote CPUs that are
  * to be signalled with the invalidation IPI.  As an optimization, the
  * curcpu_cb callback is invoked on the calling CPU in a critical
  * section while waiting for the remote CPUs to complete the operation.
  *
  * The callback function is called unconditionally on the caller's
  * underlying processor, even when this processor is not set in the
  * mask.  So, the callback function must be prepared to handle such
  * spurious invocations.
  *
  * Interrupts must be enabled when calling the function with smp
  * started, to avoid deadlock with other IPIs that are protected with
  * smp_ipi_mtx spinlock at the initiator side.
  *
  * Function must be called with the thread pinned, and it unpins on
  * completion.
  *
  * Arguments:
  *	mask		CPUs to invalidate on; the calling CPU is removed
  *			from it before any IPI is sent.
  *	pmap, addr1,
  *	addr2		Published in the initiator's pcpu (pc_smp_tlb_*)
  *			for the targets and passed unchanged to curcpu_cb.
  *	curcpu_cb	Invalidation callback run on the calling CPU.
  *	op		Request code published in pc_smp_tlb_op.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
     vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op)
 {
 	cpuset_t other_cpus;
 	uint32_t generation, *p_cpudone;
 	int cpu;
 	bool is_all;
 
 	/*
 	 * It is not necessary to signal other CPUs while booting or
 	 * when in the debugger.
 	 */
 	if (kdb_active || KERNEL_PANICKED() || !smp_started)
 		goto local_cb;
 
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	/* is_all is computed before the calling CPU is cleared from mask. */
 	is_all = !CPU_CMP(&mask, &all_cpus);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
 	if (CPU_EMPTY(&mask))
 		goto local_cb;
 
 	/*
 	 * Initiator must have interrupts enabled, which prevents
 	 * non-invalidation IPIs that take smp_ipi_mtx spinlock,
 	 * from deadlocking with us.  On the other hand, preemption
 	 * must be disabled to pin initiator to the instance of the
 	 * pcpu pc_smp_tlb data and scoreboard line.
 	 */
 	KASSERT((read_rflags() & PSL_I) != 0,
 	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
 	critical_enter();
 
 	PCPU_SET(smp_tlb_addr1, addr1);
 	PCPU_SET(smp_tlb_addr2, addr2);
 	PCPU_SET(smp_tlb_pmap, pmap);
 	/*
 	 * Advance the generation, skipping 0 on wrap-around: a zero
 	 * scoreboard slot marks a pending request.
 	 */
 	generation = PCPU_GET(smp_tlb_gen);
 	if (++generation == 0)
 		generation = 1;
 	PCPU_SET(smp_tlb_gen, generation);
 	PCPU_SET(smp_tlb_op, op);
 	/* Fence between filling smp_tlb fields and clearing scoreboard. */
 	atomic_thread_fence_rel();
 
 	/* Post the request: zero our slot in each target's scoreboard row. */
 	CPU_FOREACH_ISSET(cpu, &mask) {
 		KASSERT(*invl_scoreboard_slot(cpu) != 0,
 		    ("IPI scoreboard is zero, initiator %d target %d",
 		    PCPU_GET(cpuid), cpu));
 		*invl_scoreboard_slot(cpu) = 0;
 	}
 
 	/*
 	 * IPI acts as a fence between writing to the scoreboard above
 	 * (zeroing slot) and reading from it below (wait for
 	 * acknowledgment).
 	 */
 	if (is_all) {
 		ipi_all_but_self(IPI_INVLOP);
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 	} else {
 		other_cpus = mask;
 		ipi_selected(mask, IPI_INVLOP);
 	}
 	/* Do the local invalidation while the targets are working. */
 	curcpu_cb(pmap, addr1, addr2);
 	/* A target acknowledges by storing our generation into the slot. */
 	CPU_FOREACH_ISSET(cpu, &other_cpus) {
 		p_cpudone = invl_scoreboard_slot(cpu);
 		while (atomic_load_int(p_cpudone) != generation)
 			ia32_pause();
 	}
 
 	/*
 	 * Unpin before leaving critical section.  If the thread owes
 	 * preemption, this allows scheduler to select thread on any
 	 * CPU from its cpuset.
 	 */
 	sched_unpin();
 	critical_exit();
 
 	return;
 
 local_cb:
 	/* No remote CPU to signal: only run the local callback. */
 	critical_enter();
 	curcpu_cb(pmap, addr1, addr2);
 	sched_unpin();
 	critical_exit();
 }
 
 /*
  * Request a whole-TLB invalidation for 'pmap' on the CPUs in 'mask',
  * using the request code selected at startup (invl_op_tlb).  'curcpu_cb'
  * performs the invalidation on the calling CPU.
  */
 void
 smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 
 	smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb,
 	    invl_op_tlb);
 #ifdef COUNT_XINVLTLB_HITS
 	++ipi_global;
 #endif
 }
 
 /*
  * Request invalidation of the single page at 'addr' in 'pmap' on the CPUs
  * in 'mask', using the request code selected at startup (invl_op_pg).
  * 'curcpu_cb' performs the invalidation on the calling CPU.
  */
 void
 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
     smp_invl_cb_t curcpu_cb)
 {
 
 	smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb,
 	    invl_op_pg);
 #ifdef COUNT_XINVLTLB_HITS
 	++ipi_page;
 #endif
 }
 
 /*
  * Request invalidation of the address range from 'addr1' up to 'addr2'
  * in 'pmap' on the CPUs in 'mask', using the request code selected at
  * startup (invl_op_pgrng).  'curcpu_cb' performs the invalidation on the
  * calling CPU.
  */
 void
 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
     pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 
 	smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2,
 	    curcpu_cb, invl_op_pgrng);
 #ifdef COUNT_XINVLTLB_HITS
 	++ipi_range;
 	/* Account for the size of the range in pages. */
 	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 }
 
 /*
  * Request a cache flush (INVL_OP_CACHE) on all CPUs.  No pmap or address
  * is associated with the request; 'curcpu_cb' does the work on the
  * calling CPU.
  */
 void
 smp_cache_flush(smp_invl_cb_t curcpu_cb)
 {
 
 	smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0,
 	    curcpu_cb, INVL_OP_CACHE);
 }
 
 /*
  * Handlers for TLB related IPIs
  */
 /*
  * INVL_OP_TLB: invalidate the TLB of this CPU with invltlb_glob() when the
  * request is for the kernel pmap and with invltlb() otherwise.
  */
 static void
 invltlb_handler(pmap_t smp_tlb_pmap)
 {
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	if (smp_tlb_pmap != kernel_pmap) {
 		invltlb();
 		return;
 	}
 	invltlb_glob();
 }
 
 /*
  * INVL_OP_TLB_INVPCID: invalidate the TLB entries tagged with the PCID
  * that 'smp_tlb_pmap' uses on this CPU.  INVPCID_CTXGLOB is used for the
  * kernel pmap and INVPCID_CTX for any other pmap.
  */
 static void
 invltlb_invpcid_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr desc;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	desc.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	desc.pad = 0;
 	desc.addr = 0;
 	if (smp_tlb_pmap == kernel_pmap)
 		invpcid(&desc, INVPCID_CTXGLOB);
 	else
 		invpcid(&desc, INVPCID_CTX);
 }
 
 /*
  * INVL_OP_TLB_INVPCID_PTI: like the INVPCID full flush, but for a pmap
  * other than the kernel pmap that is current on this CPU and has a user
  * page table (pm_ucr3 != PMAP_NO_CR3), pc_ucr3_load_mask is additionally
  * set to ~CR3_PCID_SAVE.
  */
 static void
 invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr desc;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	desc.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	desc.pad = 0;
 	desc.addr = 0;
 
 	if (smp_tlb_pmap == kernel_pmap) {
 		/*
 		 * What is really needed is to drop the kernel mappings
 		 * from the TLB for the current pmap.  Because the flush
 		 * was requested for the kernel pmap, a global flush is
 		 * used to get there.
 		 */
 		invpcid(&desc, INVPCID_CTXGLOB);
 		return;
 	}
 
 	invpcid(&desc, INVPCID_CTX);
 	if (smp_tlb_pmap != PCPU_GET(curpmap))
 		return;
 	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
 		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 }
 
 /*
  * INVL_OP_TLB_PCID: full flush when PCID is in use without INVPCID.  The
  * kernel pmap is handled with invltlb_glob().  Any other pmap is flushed
  * by reloading %cr3 with its page table root and PCID, and only if it is
  * the pmap currently active on this CPU.
  */
 static void
 invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	if (smp_tlb_pmap == kernel_pmap) {
 		invltlb_glob();
 		return;
 	}
 
 	/*
 	 * smp_tlb_pmap need not be the pmap this CPU is running on.  In
 	 * that case nothing is done here: pmap_invalidate_all() clears
 	 * pm_gen, which takes care of the TLB invalidation when this CPU
 	 * later switches to the pmap.
 	 */
 	if (smp_tlb_pmap != PCPU_GET(curpmap))
 		return;
 
 	pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	load_cr3(smp_tlb_pmap->pm_cr3 | pcid);
 	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
 		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 }
 
 /*
  * INVL_OP_PG: invalidate the TLB entry for the page at 'smp_tlb_addr1' on
  * this CPU.
  */
 static void
 invlpg_handler(vm_offset_t smp_tlb_addr1)
 {
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	invlpg(smp_tlb_addr1);
 }
 
 /*
  * INVL_OP_PG_INVPCID: invalidate the page at 'smp_tlb_addr1' with
  * invlpg().  If 'smp_tlb_pmap' is current on this CPU, has a user page
  * table and pc_ucr3_load_mask is PMAP_UCR3_NOMASK, the address is also
  * invalidated with INVPCID in the pmap's user-page-table PCID.
  */
 static void
 invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	struct invpcid_descr desc;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	invlpg(smp_tlb_addr1);
 
 	if (smp_tlb_pmap != PCPU_GET(curpmap) ||
 	    smp_tlb_pmap->pm_ucr3 == PMAP_NO_CR3 ||
 	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	desc.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 	    PMAP_PCID_USER_PT;
 	desc.pad = 0;
 	desc.addr = smp_tlb_addr1;
 	invpcid(&desc, INVPCID_ADDR);
 }
 
 /*
  * INVL_OP_PG_PCID: invalidate the page at 'smp_tlb_addr1' with invlpg().
  * If 'smp_tlb_pmap' is current on this CPU, has a user page table and
  * pc_ucr3_load_mask is PMAP_UCR3_NOMASK, pmap_pti_pcid_invlpg() is called
  * with the user and kernel %cr3 values (both with CR3_PCID_SAVE set) to
  * invalidate the address for the user page table as well.
  */
 static void
 invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	invlpg(smp_tlb_addr1);
 
 	if (smp_tlb_pmap != PCPU_GET(curpmap))
 		return;
 	ucr3 = smp_tlb_pmap->pm_ucr3;
 	if (ucr3 == PMAP_NO_CR3)
 		return;
 	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 	ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 	pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
 }
 
 /*
  * INVL_OP_PGRNG: invalidate, page by page, the range starting at
  * 'smp_tlb_addr1' and ending before 'smp_tlb_addr2'.  The first page is
  * always invalidated, even if the range is empty.
  */
 static void
 invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t va;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	va = smp_tlb_addr1;
 	for (;;) {
 		invlpg(va);
 		va += PAGE_SIZE;
 		if (va >= smp_tlb_addr2)
 			break;
 	}
 }
 
 /*
  * INVL_OP_PGRNG_INVPCID: invalidate, page by page, the range starting at
  * 'smp_tlb_addr1' and ending before 'smp_tlb_addr2' with invlpg().  If
  * 'smp_tlb_pmap' is current on this CPU, has a user page table and
  * pc_ucr3_load_mask is PMAP_UCR3_NOMASK, the same pages are then
  * invalidated with INVPCID in the pmap's user-page-table PCID.  In both
  * passes the first page is always processed.
  */
 static void
 invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
     vm_offset_t smp_tlb_addr2)
 {
 	struct invpcid_descr desc;
 	vm_offset_t va;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	va = smp_tlb_addr1;
 	for (;;) {
 		invlpg(va);
 		va += PAGE_SIZE;
 		if (va >= smp_tlb_addr2)
 			break;
 	}
 
 	if (smp_tlb_pmap != PCPU_GET(curpmap) ||
 	    smp_tlb_pmap->pm_ucr3 == PMAP_NO_CR3 ||
 	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	desc.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 	    PMAP_PCID_USER_PT;
 	desc.pad = 0;
 	desc.addr = smp_tlb_addr1;
 	for (;;) {
 		invpcid(&desc, INVPCID_ADDR);
 		desc.addr += PAGE_SIZE;
 		if (desc.addr >= smp_tlb_addr2)
 			break;
 	}
 }
 
 /*
  * INVL_OP_PGRNG_PCID: invalidate, page by page, the range starting at
  * 'smp_tlb_addr1' and ending before 'smp_tlb_addr2' with invlpg() (the
  * first page is always invalidated).  If 'smp_tlb_pmap' is current on
  * this CPU, has a user page table and pc_ucr3_load_mask is
  * PMAP_UCR3_NOMASK, pmap_pti_pcid_invlrng() is called with the user and
  * kernel %cr3 values (both with CR3_PCID_SAVE set) for the same range.
  */
 static void
 invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
     vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t va;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	va = smp_tlb_addr1;
 	for (;;) {
 		invlpg(va);
 		va += PAGE_SIZE;
 		if (va >= smp_tlb_addr2)
 			break;
 	}
 
 	if (smp_tlb_pmap != PCPU_GET(curpmap))
 		return;
 	ucr3 = smp_tlb_pmap->pm_ucr3;
 	if (ucr3 == PMAP_NO_CR3)
 		return;
 	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 	ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 	pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, smp_tlb_addr2);
 }
 
 /*
  * INVL_OP_CACHE: execute wbinvd() on this CPU.
  */
 static void
 invlcache_handler(void)
 {
 #ifdef COUNT_IPIS
 	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif
 
 	wbinvd();
 }
 
 /*
  * Execute one invalidation request on this CPU by dispatching on the
  * request code 'smp_tlb_op'.  'smp_tlb_pmap', 'smp_tlb_addr1' and
  * 'smp_tlb_addr2' are handed to the selected handler as far as that
  * handler takes them.  An unknown request code is asserted to be
  * unreachable.
  */
 static void
 invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap,
     vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
 {
 	switch (smp_tlb_op) {
 	/* Whole-TLB invalidation. */
 	case INVL_OP_TLB:
 		invltlb_handler(smp_tlb_pmap);
 		return;
 	case INVL_OP_TLB_INVPCID:
 		invltlb_invpcid_handler(smp_tlb_pmap);
 		return;
 	case INVL_OP_TLB_INVPCID_PTI:
 		invltlb_invpcid_pti_handler(smp_tlb_pmap);
 		return;
 	case INVL_OP_TLB_PCID:
 		invltlb_pcid_handler(smp_tlb_pmap);
 		return;
 
 	/* Page range invalidation. */
 	case INVL_OP_PGRNG:
 		invlrng_handler(smp_tlb_addr1, smp_tlb_addr2);
 		return;
 	case INVL_OP_PGRNG_INVPCID:
 		invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 		return;
 	case INVL_OP_PGRNG_PCID:
 		invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 		return;
 
 	/* Single page invalidation. */
 	case INVL_OP_PG:
 		invlpg_handler(smp_tlb_addr1);
 		return;
 	case INVL_OP_PG_INVPCID:
 		invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1);
 		return;
 	case INVL_OP_PG_PCID:
 		invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1);
 		return;
 
 	/* Cache flush. */
 	case INVL_OP_CACHE:
 		invlcache_handler();
 		return;
 
 	default:
 		__assert_unreachable();
 		return;
 	}
 }
 
 /*
  * Target side of the invalidation protocol (presumably called from the
  * IPI_INVLOP interrupt entry, which is not part of this file section).
  *
  * Repeatedly scans this CPU's scoreboard row for a slot that an initiator
  * has zeroed.  For each such slot the request is read from the
  * initiator's pcpu, the slot is set to the initiator's generation to
  * acknowledge it, and the request is executed.  The function returns once
  * a complete scan finds no zeroed slot.
  */
 void
 invlop_handler(void)
 {
 	struct pcpu *initiator_pc;
 	pmap_t smp_tlb_pmap;
 	vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 	u_int initiator_cpu_id;
 	enum invl_op_codes smp_tlb_op;
 	uint32_t *scoreboard, smp_tlb_gen;
 
 	scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid));
 	for (;;) {
 		/* Find the lowest-numbered initiator with a pending request. */
 		for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid;
 		    initiator_cpu_id++) {
 			if (atomic_load_int(&scoreboard[initiator_cpu_id]) == 0)
 				break;
 		}
 		/* No zeroed slot left: all requests have been handled. */
 		if (initiator_cpu_id > mp_maxid)
 			break;
 		initiator_pc = cpuid_to_pcpu[initiator_cpu_id];
 
 		/*
 		 * This acquire fence and its corresponding release
 		 * fence in smp_targeted_tlb_shootdown() is between
 		 * reading zero scoreboard slot and accessing PCPU of
 		 * initiator for pc_smp_tlb values.
 		 */
 		atomic_thread_fence_acq();
 		smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap;
 		smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1;
 		smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2;
 		smp_tlb_op = initiator_pc->pc_smp_tlb_op;
 		smp_tlb_gen = initiator_pc->pc_smp_tlb_gen;
 
 		/*
 		 * Ensure that we do not make our scoreboard
 		 * notification visible to the initiator until the
 		 * pc_smp_tlb values are read.  The corresponding
 		 * fence is implicitly provided by the barrier in the
 		 * IPI send operation before the APIC ICR register
 		 * write.
 		 *
 		 * As an optimization, the request is acknowledged
 		 * before the actual invalidation is performed.  It is
 		 * safe because target CPU cannot return to userspace
 		 * before handler finishes. Only NMI can preempt the
 		 * handler, but NMI would see the kernel handler frame
 		 * and not touch not-invalidated user page table.
 		 */
 		atomic_thread_fence_acq();
 		/* Acknowledge: the initiator waits for its generation here. */
 		atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen);
 
 		invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 	}
 }
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 8d6c81a5459b..f8bb384afdaf 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -1,11987 +1,11971 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * Copyright (c) 2014-2020 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #define	AMD64_NPT_AWARE
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_ddb.h"
 #include "opt_pmap.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/asan.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msan.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rangeset.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/smr.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #ifdef DDB
 #include <sys/kdb.h>
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_dumpset.h>
 #include <vm/uma.h>
 
 #include <machine/asan.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/ifunc.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/msan.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/sysarch.h>
 #include <machine/tss.h>
 
 #ifdef NUMA
 #define	PMAP_MEMDOM	MAXMEMDOM
 #else
 #define	PMAP_MEMDOM	1
 #endif
 
 /*
  * Report whether the pmap is a guest pmap, i.e. whether its pm_type is
  * PT_EPT or PT_RVI.
  */
 static __inline boolean_t
 pmap_type_guest(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_EPT:
 	case PT_RVI:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Report whether PMAP_EMULATE_AD_BITS is set in the pmap's pm_flags.
  */
 static __inline boolean_t
 pmap_emulate_ad_bits(pmap_t pmap)
 {
 
 	if ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Return the page table entry bit that means "valid" for the pmap's
  * page table format.  PT_X86 and PT_RVI pmaps use X86_PG_V.  PT_EPT
  * pmaps use EPT_PG_EMUL_V when A/D bit emulation is on and EPT_PG_READ
  * otherwise.  Any other pm_type panics.
  */
 static __inline pt_entry_t
 pmap_valid_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_EPT:
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_EMUL_V);
 		return (EPT_PG_READ);
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_V);
 	default:
 		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that grants write access for the
  * pmap's page table format.  PT_X86 and PT_RVI pmaps use X86_PG_RW.
  * PT_EPT pmaps use EPT_PG_EMUL_RW when A/D bit emulation is on and
  * EPT_PG_WRITE otherwise.  Any other pm_type panics.
  */
 static __inline pt_entry_t
 pmap_rw_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_EPT:
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_EMUL_RW);
 		return (EPT_PG_WRITE);
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_RW);
 	default:
 		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /* Global-mapping bit handed out for PT_X86 pmaps by pmap_global_bit(). */
 static pt_entry_t pg_g;
 
 /*
  * Return the page table entry bit that marks a mapping global.  Only
  * PT_X86 pmaps have one (the current value of pg_g); PT_RVI and PT_EPT
  * pmaps get 0.  Any other pm_type panics.
  */
 static __inline pt_entry_t
 pmap_global_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_RVI:
 	case PT_EPT:
 		return (0);
 	case PT_X86:
 		return (pg_g);
 	default:
 		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that records "accessed" for the
  * pmap's page table format.  PT_X86 and PT_RVI pmaps use X86_PG_A.
  * PT_EPT pmaps use EPT_PG_READ when A/D bit emulation is on and
  * EPT_PG_A otherwise.  Any other pm_type panics.
  */
 static __inline pt_entry_t
 pmap_accessed_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_EPT:
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_READ);
 		return (EPT_PG_A);
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_A);
 	default:
 		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that records "modified" for the
  * pmap's page table format.  PT_X86 and PT_RVI pmaps use X86_PG_M.
  * PT_EPT pmaps use EPT_PG_WRITE when A/D bit emulation is on and
  * EPT_PG_M otherwise.  Any other pm_type panics.
  */
 static __inline pt_entry_t
 pmap_modified_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_EPT:
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_WRITE);
 		return (EPT_PG_M);
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_M);
 	default:
 		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the mask of protection-key bits in a page table entry:
  * X86_PG_PKU_MASK for PT_X86 pmaps, 0 for every other pmap type.
  */
 static __inline pt_entry_t
 pmap_pku_mask_bit(pmap_t pmap)
 {
 
 	if (pmap->pm_type != PT_X86)
 		return (0);
 	return (X86_PG_PKU_MASK);
 }
 
 /*
  * PMAP_INLINE: inline qualifier.  With DIAGNOSTIC defined it expands to
  * nothing.  Otherwise it expands to the gnu_inline attribute form when
  * __GNUC_GNU_INLINE__ is defined and to plain "extern inline" when it
  * is not.
  */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /*
  * PV_STAT(x): execute statement x only in kernels built with PV_STATS.
  * Without PV_STATS it expands to an empty statement and x is never
  * evaluated, so x must not carry side effects the caller depends on.
  */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /*
  * Physical address to PV table helpers.  Any earlier definition of
  * pa_index is discarded first.
  *
  * pa_index(pa)			pa >> PDRSHIFT.  The NUMA variant also
  *				asserts that pa does not lie beyond the
  *				end of the last vm_phys segment.
  * pa_to_pmdp(pa)		(NUMA only) the pmap_large_md_page slot
  *				in pv_table for pa.
  * pa_to_pvh(pa)		the struct md_page for pa in pv_table.
  * PHYS_TO_PV_LIST_LOCK(pa)	the rwlock guarding that PV list.  With
  *				NUMA the lock lives in the pv_table slot,
  *				except that addresses above pmap_last_pa
  *				share pv_dummy_large.pv_lock.  Without
  *				NUMA the index is hashed modulo
  *				NPV_LIST_LOCKS into pv_list_locks[].
  *
  * The NUMA variants are statement expressions and reference pa more
  * than once.
  */
 #undef pa_index
 #ifdef NUMA
 #define	pa_index(pa)	({					\
 	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
 	    ("address %lx beyond the last segment", (pa)));	\
 	(pa) >> PDRSHIFT;					\
 })
 #define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
 	struct rwlock *_lock;					\
 	if (__predict_false((pa) > pmap_last_pa))		\
 		_lock = &pv_dummy_large.pv_lock;		\
 	else							\
 		_lock = &(pa_to_pmdp(pa)->pv_lock);		\
 	_lock;							\
 })
 #else
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
 #define	NPV_LIST_LOCKS	MAXCPU
 
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
 #endif
 
 /*
  * CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa): make *lockp the write-locked
  * PV list lock for physical address pa.  If *lockp already is that lock
  * nothing happens.  Otherwise the lock currently in *lockp (if any) is
  * write-unlocked, *lockp is pointed at the new lock, and the new lock
  * is write-locked.  The old lock is dropped before the new one is
  * taken, so for a moment the caller holds neither.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the vm_page m. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /*
  * RELEASE_PV_LIST_LOCK(lockp): write-unlock the lock in *lockp, if any,
  * and reset *lockp to NULL.
  */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* The PV list lock for the physical address backing vm_page m. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /* Storage for the kernel's own struct pmap. */
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 
 /* Exported read-only as machdep.nkpt. */
 int nkpt;
 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
     "Number of kernel page table pages allocated on bootup");
 
 /*
  * NOTE(review): ndmpdp and dmaplimit look like the direct map's PDP page
  * count and upper physical limit; pg_nx looks like the no-execute PTE
  * bit.  None of them is assigned in this part of the file -- confirm
  * against the bootstrap code.
  */
 static int ndmpdp;
 vm_paddr_t dmaplimit;
 /* Initialised to the lowest kernel virtual address. */
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 pt_entry_t pg_nx;
 
 /* Parent node of the vm.pmap.* sysctl entries declared in this file. */
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM/pmap parameters");
 
 /* Defaults to enabled; exported as the read-only tunable vm.pmap.pg_ps_enabled. */
 static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 
 /* Defaults to off; consulted by pmap_is_la57() for PT_X86 pmaps. */
 int __read_frequently la57 = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &la57, 0,
     "5-level paging for host is enabled");
 
 /*
  * Report whether the pmap uses 5-level paging.  Only PT_X86 pmaps can:
  * for them the answer follows the global la57 setting, and every other
  * pmap type yields false.
  */
 static bool
 pmap_is_la57(pmap_t pmap)
 {
 
 	/* XXXKIB handle EPT */
 	return (pmap->pm_type == PT_X86 && la57 != 0);
 }
 
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 /* Physical addresses of the bootstrap kernel page table pages, per level. */
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
 u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
 					   if supported */
 
 /*
  * NOTE(review): judging by their names these hold the physical
  * addresses of the PDP pages for the KASAN shadow map and the KMSAN
  * shadow/origin maps -- confirm against the bootstrap code.
  */
 #ifdef KASAN
 static uint64_t		KASANPDPphys;
 #endif
 #ifdef KMSAN
 static uint64_t		KMSANSHADPDPphys;
 static uint64_t		KMSANORIGPDPphys;
 
 /*
  * To support systems with large amounts of memory, it is necessary to extend
  * the maximum size of the direct map.  This could eat into the space reserved
  * for the shadow map.
  */
 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
 #endif
 
 static pml4_entry_t	*kernel_pml4;
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
 vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
 vm_paddr_t		KERNend;	/* and the end */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
  *
  * Each table entry records one mapping: its physical address, virtual
  * address, size and mode.  The table holds at most
  * PMAP_PREINIT_MAPPING_COUNT entries.
  */
 #define	PMAP_PREINIT_MAPPING_COUNT	8
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	sz;
 	int		mode;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 static int pmap_initialized;
 
 /*
  * Data for the pv entry allocation mechanism.
  * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
  */
 /*
  * Return the memory domain that holds the pv chunk pc.  With NUMA this
  * is the vm_phys domain of the physical page behind pc's direct map
  * address; without NUMA there is a single domain, 0, and pc is not
  * examined.
  */
 static __inline int
 pc_to_domain(struct pv_chunk *pc __unused)
 {
 
 #ifdef NUMA
 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
 #else
 	return (0);
 #endif
 }
 
 /*
  * Cache-line aligned list of pv chunks together with its mutex.
  * NOTE(review): active_reclaims is presumably a count of reclaim passes
  * currently walking the list -- confirm at its users.
  */
 struct pv_chunks_list {
 	struct mtx pvc_lock;
 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
 	int active_reclaims;
 } __aligned(CACHE_LINE_SIZE);
 
 /* One list per PMAP_MEMDOM slot (MAXMEMDOM with NUMA, otherwise 1). */
 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
 
 /*
  * PV bookkeeping storage.
  *
  * With NUMA, pv_table is an array of pmap_large_md_page; each slot
  * bundles the PV list lock, the md_page and the delayed invalidation
  * generation, and is indexed via pa_to_pmdp().  Physical addresses
  * above pmap_last_pa share the single pv_dummy_large slot instead.
  *
  * Without NUMA, pv_table is a plain md_page array, and the locks and
  * generations live in separate arrays of NPV_LIST_LOCKS entries that
  * are indexed by pa_index(pa) % NPV_LIST_LOCKS.
  */
 #ifdef	NUMA
 struct pmap_large_md_page {
 	struct rwlock   pv_lock;
 	struct md_page  pv_page;
 	u_long pv_invl_gen;
 };
 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
 #define pv_dummy pv_dummy_large.pv_page
 __read_mostly static struct pmap_large_md_page *pv_table;
 __read_mostly vm_paddr_t pmap_last_pa;
 #else
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
 static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 #endif
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 pt_entry_t *CMAP1 = NULL;
 caddr_t CADDR1 = 0;
 static vm_offset_t qframe = 0;
 static struct mtx qframe_mtx;
 
 static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
 
 static vmem_t *large_vmem;
 static u_int lm_ents;
 /*
  * True when va lies in the large map, i.e. in the lm_ents * NBPML4 bytes
  * starting at LARGEMAP_MIN_ADDRESS.  va is evaluated twice.
  */
 #define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= LARGEMAP_MIN_ADDRESS && \
 	(va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
 
 /* Defaults to enabled; exported as the read-only tunable vm.pmap.pcid_enabled. */
 int pmap_pcid_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
 /* Defaults to 0; exported read-only as vm.pmap.invpcid_works. */
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
 
 /* Defaults to off; exported as the read-only tunable vm.pmap.pti. */
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pti, 0,
     "Page Table Isolation enabled");
 /*
  * NOTE(review): state for the page-table-isolation page tables; none of
  * these is used in this part of the file -- confirm roles at the users.
  */
 static vm_object_t pti_obj;
 static pml4_entry_t *pti_pml4;
 static vm_pindex_t pti_pg_idx;
 static bool pti_finalized;
 
 /*
  * One range in a pmap's protection-key rangeset.  pkru_rs_el is the
  * rangeset linkage element.  NOTE(review): pkru_keyidx is presumably
  * the protection key index assigned to the range -- confirm at the
  * users.
  */
 struct pmap_pkru_range {
 	struct rs_el	pkru_rs_el;
 	u_int		pkru_keyidx;
 	int		pkru_flags;
 };
 
 /* UMA zone and forward declarations for the protection-key helpers. */
 static uma_zone_t pmap_pkru_ranges_zone;
 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static void *pkru_dup_range(void *ctx, void *data);
 static void pkru_free_range(void *ctx, void *node);
 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static void pmap_pkru_deassign_all(pmap_t pmap);
 
 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD,
     &pcid_save_cnt, "Count of saved TLB context on switch");
 
 /*
  * State for delayed invalidation (DI).
  *
  * Locked variant: threads inside a DI block are linked on
  * pmap_invl_gen_tracker under invl_gen_mtx, and pmap_invl_gen is the
  * global generation that waiters compare a page's generation against.
  *
  * Lockless variant: threads are chained through pmap_invl_gen.next
  * starting at pmap_invl_gen_head, and pmap_invl_gen_head.gen plays the
  * role of the global generation.
  *
  * Both generations start at 1; a per-thread generation of 0 is what the
  * thread init functions store.
  */
 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
 static struct mtx invl_gen_mtx;
 /* Fake lock object to satisfy turnstiles interface. */
 static struct lock_object invl_gen_ts = {
 	.lo_name = "invlts",
 };
 static struct pmap_invl_gen pmap_invl_gen_head = {
 	.gen = 1,
 	.next = NULL,
 };
 static u_long pmap_invl_gen = 1;
 /* Number of threads in the blocking path of pmap_delayed_invl_wait_u(). */
 static int pmap_invl_waiters;
 /* Callout armed by pmap_delayed_invl_wait_u() before it blocks. */
 static struct callout pmap_invl_callout;
 /* Set by pmap_delayed_invl_callout_init() once the callout is usable. */
 static bool pmap_invl_callout_inited;
 
 /* Assert that the current thread is not already inside a DI block. */
 #define	PMAP_ASSERT_NOT_IN_DI() \
     KASSERT(pmap_not_in_di(), ("DI already started"))
 
 static bool
 pmap_di_locked(void)
 {
 	int tun;
 
 	if ((cpu_feature2 & CPUID2_CX16) == 0)
 		return (true);
 	tun = 0;
 	TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
 	return (tun != 0);
 }
 
 /*
  * Handler for the vm.pmap.di_locked sysctl: reports the current answer
  * of pmap_di_locked() as an int via sysctl_handle_int().
  */
 static int
 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
 {
 	int is_locked;
 
 	is_locked = pmap_di_locked();
 	return (sysctl_handle_int(oidp, &is_locked, 0, req));
 }
 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
     CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
     "Locked delayed invalidation");
 
 static bool pmap_not_in_di_l(void);
 static bool pmap_not_in_di_u(void);
 /*
  * Resolver for pmap_not_in_di(): picks the locked implementation when
  * pmap_di_locked() says so and the lockless one otherwise.
  */
 DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
 {
 
 	if (pmap_di_locked())
 		return (pmap_not_in_di_l);
 	return (pmap_not_in_di_u);
 }
 
 /*
  * Locked variant: the current thread is outside any DI block exactly
  * when its invalidation generation is 0.
  */
 static bool
 pmap_not_in_di_l(void)
 {
 
 	return (curthread->td_md.md_invl_gen.gen == 0);
 }
 
 /*
  * Locked variant: put thread td's DI state into the "not in a DI block"
  * condition by zeroing its invalidation generation.
  */
 static void
 pmap_thread_init_invl_gen_l(struct thread *td)
 {
 
 	td->td_md.md_invl_gen.gen = 0;
 }
 
 /*
  * Block the current thread on the invl_gen_ts turnstile (shared queue)
  * if the page generation *m_gen is still ahead of the global generation
  * *invl_gen.  The comparison is made after turnstile_trywait(); if the
  * global generation has caught up, the wait is cancelled instead.
  */
 static void
 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
 {
 	struct turnstile *ts;
 
 	ts = turnstile_trywait(&invl_gen_ts);
 	if (*m_gen <= atomic_load_long(invl_gen)) {
 		turnstile_cancel(ts);
 		return;
 	}
 	turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
 }
 
 /*
  * Wake all threads blocked on the invl_gen_ts turnstile's shared queue.
  *
  * If new_gen is non-zero it is first stored into the global
  * pmap_invl_gen; callers pass 0 when they only want the wakeup.  The
  * store and the broadcast both happen with the turnstile chain lock
  * held.
  * NOTE(review): presumably this is what keeps a waiter's re-check in
  * pmap_delayed_invl_wait_block() from missing the update -- confirm
  * against turnstile(9).
  */
 static void
 pmap_delayed_invl_finish_unblock(u_long new_gen)
 {
 	struct turnstile *ts;
 
 	turnstile_chain_lock(&invl_gen_ts);
 	ts = turnstile_lookup(&invl_gen_ts);
 	if (new_gen != 0)
 		pmap_invl_gen = new_gen;
 	/* A NULL turnstile means nobody is blocked; nothing to wake. */
 	if (ts != NULL) {
 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
 		turnstile_unpend(ts);
 	}
 	turnstile_chain_unlock(&invl_gen_ts);
 }
 
 /*
  * Start a new Delayed Invalidation (DI) block of code, executed by
  * the current thread.  Within a DI block, the current thread may
  * destroy both the page table and PV list entries for a mapping and
  * then release the corresponding PV list lock before ensuring that
  * the mapping is flushed from the TLBs of any processors with the
  * pmap active.
  */
 static void
 pmap_delayed_invl_start_l(void)
 {
 	struct pmap_invl_gen *invl_gen;
 	u_long currgen;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	PMAP_ASSERT_NOT_IN_DI();
 	mtx_lock(&invl_gen_mtx);
 	if (LIST_EMPTY(&pmap_invl_gen_tracker))
 		currgen = pmap_invl_gen;
 	else
 		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
 	invl_gen->gen = currgen + 1;
 	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
 	mtx_unlock(&invl_gen_mtx);
 }
 
 /*
  * Finish the DI block, previously started by the current thread.  All
  * required TLB flushes for the pages marked by
  * pmap_delayed_invl_page() must be finished before this function is
  * called.
  *
  * This function works by bumping the global DI generation number to
  * the generation number of the current thread's DI, unless there is a
  * pending DI that started earlier.  In the latter case, bumping the
  * global DI generation number would incorrectly signal that the
  * earlier DI had finished.  Instead, this function bumps the earlier
  * DI's generation number to match the generation number of the
  * current thread's DI.
  */
 static void
 pmap_delayed_invl_finish_l(void)
 {
 	struct pmap_invl_gen *invl_gen, *next;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	KASSERT(invl_gen->gen != 0, ("missed invl_start"));
 	mtx_lock(&invl_gen_mtx);
 	next = LIST_NEXT(invl_gen, link);
 	if (next == NULL)
 		pmap_delayed_invl_finish_unblock(invl_gen->gen);
 	else
 		next->gen = invl_gen->gen;
 	LIST_REMOVE(invl_gen, link);
 	mtx_unlock(&invl_gen_mtx);
 	invl_gen->gen = 0;
 }
 
 static bool
 pmap_not_in_di_u(void)
 {
 	struct pmap_invl_gen *invl_gen;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
 }
 
 /*
  * Lockless variant: put thread td's DI state into the "not in a DI
  * block" condition, i.e. generation 0 and a next pointer that consists
  * solely of the PMAP_INVL_GEN_NEXT_INVALID marker.
  */
 static void
 pmap_thread_init_invl_gen_u(struct thread *td)
 {
 
 	td->td_md.md_invl_gen.gen = 0;
 	td->td_md.md_invl_gen.next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
 }
 
 /*
  * Atomically read the 16-byte (gen, next) pair at *ptr into *out.
  *
  * The read is done with "lock cmpxchg16b" using zero as both the
  * expected and the replacement value.  If *ptr is all-zero the
  * instruction succeeds and rewrites zero, so memory is unchanged; if it
  * is not, the instruction fails and leaves the current contents of *ptr
  * in RDX:RAX.  gen travels in the low quadword and next in the high one.
  *
  * Returns false, leaving *out untouched, when the value read has
  * PMAP_INVL_GEN_NEXT_INVALID set in next; returns true otherwise.
  */
 static bool
 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
 {
 	uint64_t new_high, new_low, old_high, old_low;
 	char res;
 
 	old_low = new_low = 0;
 	old_high = new_high = (uintptr_t)0;
 
 	/* res receives the "equal" condition, i.e. 1 if the compare matched. */
 	__asm volatile("lock;cmpxchg16b\t%1"
 	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
 	    : "b"(new_low), "c" (new_high)
 	    : "memory", "cc");
 	if (res == 0) {
 		/* Compare failed: old_high:old_low now hold what *ptr held. */
 		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
 			return (false);
 		out->gen = old_low;
 		out->next = (void *)old_high;
 	} else {
 		/* Compare matched: *ptr was (0, NULL). */
 		out->gen = new_low;
 		out->next = (void *)new_high;
 	}
 	return (true);
 }
 
 /*
  * Atomically replace the 16-byte (gen, next) pair at *ptr with *new_val,
  * provided it still equals *old_val, using one "lock cmpxchg16b".
  *
  * Returns true if the exchange took place and false otherwise.  On
  * failure the value found in memory ends up only in local variables;
  * *old_val is not updated.
  */
 static bool
 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
     struct pmap_invl_gen *new_val)
 {
 	uint64_t new_high, new_low, old_high, old_low;
 	char res;
 
 	/* gen goes in the low quadword, next in the high quadword. */
 	new_low = new_val->gen;
 	new_high = (uintptr_t)new_val->next;
 	old_low = old_val->gen;
 	old_high = (uintptr_t)old_val->next;
 
 	__asm volatile("lock;cmpxchg16b\t%1"
 	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
 	    : "b"(new_low), "c" (new_high)
 	    : "memory", "cc");
 	return (res);
 }
 
 /* Counters exported read-only under vm.pmap. */
 static COUNTER_U64_DEFINE_EARLY(pv_page_count);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD,
     &pv_page_count, "Current number of allocated pv pages");
 
 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD,
     &user_pt_page_count,
     "Current number of allocated page table pages for userspace");
 
 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD,
     &kernel_pt_page_count,
     "Current number of allocated page table pages for the kernel");
 
 /*
  * Statistics for the lockless delayed invalidation code, compiled in
  * only with PV_STATS.  The restart counters are bumped each time
  * pmap_delayed_invl_start_u() or pmap_delayed_invl_finish_u() has to
  * retry; invl_max_qlen is the longest DI list seen by
  * pmap_delayed_invl_start_u().
  */
 #ifdef PV_STATS
 
 static COUNTER_U64_DEFINE_EARLY(invl_start_restart);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart,
     CTLFLAG_RD, &invl_start_restart,
     "Number of delayed TLB invalidation request restarts");
 
 static COUNTER_U64_DEFINE_EARLY(invl_finish_restart);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
     &invl_finish_restart,
     "Number of delayed TLB invalidation completion restarts");
 
 static int invl_max_qlen;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
     &invl_max_qlen, 0,
     "Maximum delayed TLB invalidation request queue length");
 #endif
 
 /* The DI spin loops back off using the same parameters as locks_delay. */
 #define di_delay	locks_delay
 
 /*
  * Open a Delayed Invalidation block for the current thread (lockless
  * variant).
  *
  * The thread's md_invl_gen is appended to the tail of the singly linked
  * list rooted at pmap_invl_gen_head.  The tail's (gen, next) pair is
  * updated with a 16-byte compare-and-swap, and the new element receives
  * the tail's generation plus one.  Any inconsistency seen on the way (an
  * element marked PMAP_INVL_GEN_NEXT_INVALID, a tail that changed, a
  * failed swap) backs off with lock_delay() and restarts from the head.
  *
  * Before joining the list, a thread whose base priority value is
  * greater than PVM is moved to PVM; the previous value is kept in
  * saved_pri so that pmap_delayed_invl_finish_u() can restore it.
  */
 static void
 pmap_delayed_invl_start_u(void)
 {
 	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
 	struct thread *td;
 	struct lock_delay_arg lda;
 	uintptr_t prevl;
 	u_char pri;
 #ifdef PV_STATS
 	int i, ii;
 #endif
 
 	td = curthread;
 	invl_gen = &td->td_md.md_invl_gen;
 	PMAP_ASSERT_NOT_IN_DI();
 	lock_delay_arg_init(&lda, &di_delay);
 	invl_gen->saved_pri = 0;
 	/* Unlocked peek first; re-check under the thread lock before acting. */
 	pri = td->td_base_pri;
 	if (pri > PVM) {
 		thread_lock(td);
 		pri = td->td_base_pri;
 		if (pri > PVM) {
 			invl_gen->saved_pri = pri;
 			sched_prio(td, PVM);
 		}
 		thread_unlock(td);
 	}
 again:
 	PV_STAT(i = 0);
 	/*
 	 * Walk to the tail (the element whose next is NULL).  Only
 	 * prev.next is used as the cursor here; prev is filled in
 	 * completely by pmap_di_load_invl() below.
 	 */
 	for (p = &pmap_invl_gen_head;; p = prev.next) {
 		PV_STAT(i++);
 		prevl = (uintptr_t)atomic_load_ptr(&p->next);
 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
 			PV_STAT(counter_u64_add(invl_start_restart, 1));
 			lock_delay(&lda);
 			goto again;
 		}
 		if (prevl == 0)
 			break;
 		prev.next = (void *)prevl;
 	}
 #ifdef PV_STATS
 	if ((ii = invl_max_qlen) < i)
 		atomic_cmpset_int(&invl_max_qlen, ii, i);
 #endif
 
 	/* Snapshot the tail atomically; retry if it is no longer the tail. */
 	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
 		PV_STAT(counter_u64_add(invl_start_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	new_prev.gen = prev.gen;
 	new_prev.next = invl_gen;
 	invl_gen->gen = prev.gen + 1;
 
 	/* Formal fence between store to invl->gen and updating *p. */
 	atomic_thread_fence_rel();
 
 	/*
 	 * After inserting an invl_gen element with invalid bit set,
 	 * this thread blocks any other thread trying to enter the
 	 * delayed invalidation block.  Do not allow to remove us from
 	 * the CPU, because it causes starvation for other threads.
 	 */
 	critical_enter();
 
 	/*
 	 * ABA for *p is not possible there, since p->gen can only
 	 * increase.  So if the *p thread finished its di, then
 	 * started a new one and got inserted into the list at the
 	 * same place, its gen will appear greater than the previously
 	 * read gen.
 	 */
 	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
 		critical_exit();
 		PV_STAT(counter_u64_add(invl_start_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	/*
 	 * There we clear PMAP_INVL_GEN_NEXT_INVALID in
 	 * invl_gen->next, allowing other threads to iterate past us.
 	 * pmap_di_store_invl() provides fence between the generation
 	 * write and the update of next.
 	 */
 	invl_gen->next = NULL;
 	critical_exit();
 }
 
 /*
  * Try to unlink invl_gen from the DI list, given that p was observed to
  * be its predecessor.  pmap_delayed_invl_finish_u() calls this inside a
  * critical section, after setting PMAP_INVL_GEN_NEXT_INVALID in
  * invl_gen->next.
  *
  * On success the predecessor atomically takes over invl_gen's
  * generation and its successor pointer (with the INVALID marker
  * stripped), and true is returned.  false is returned, with the list
  * unchanged, if p cannot be read in a valid state, no longer points at
  * invl_gen, or changes before the swap; the caller then retries.
  */
 static bool
 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
     struct pmap_invl_gen *p)
 {
 	struct pmap_invl_gen prev, new_prev;
 	u_long mygen;
 
 	/*
 	 * Load invl_gen->gen after setting invl_gen->next
 	 * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
 	 * generations to propagate to our invl_gen->gen.  Lock prefix
 	 * in atomic_set_ptr() worked as seq_cst fence.
 	 */
 	mygen = atomic_load_long(&invl_gen->gen);
 
 	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
 		return (false);
 
 	KASSERT(prev.gen < mygen,
 	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
 	new_prev.gen = mygen;
 	new_prev.next = (void *)((uintptr_t)invl_gen->next &
 	    ~PMAP_INVL_GEN_NEXT_INVALID);
 
 	/* Formal fence between load of prev and storing update to it. */
 	atomic_thread_fence_rel();
 
 	return (pmap_di_store_invl(p, &prev, &new_prev));
 }
 
 /*
  * Close the DI block of the current thread (lockless variant).
  *
  * The thread finds its predecessor on the list rooted at
  * pmap_invl_gen_head, marks its own next pointer
  * PMAP_INVL_GEN_NEXT_INVALID and lets
  * pmap_delayed_invl_finish_u_crit() splice it out, handing its
  * generation to the predecessor.  Every failure along the way backs off
  * with lock_delay() and restarts from the head.  Once unlinked, the
  * thread wakes blocked waiters if pmap_invl_waiters is non-zero and
  * restores the priority saved by pmap_delayed_invl_start_u(), if any.
  */
 static void
 pmap_delayed_invl_finish_u(void)
 {
 	struct pmap_invl_gen *invl_gen, *p;
 	struct thread *td;
 	struct lock_delay_arg lda;
 	uintptr_t prevl;
 
 	td = curthread;
 	invl_gen = &td->td_md.md_invl_gen;
 	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
 	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
 	    ("missed invl_start: INVALID"));
 	lock_delay_arg_init(&lda, &di_delay);
 
 again:
 	/* Search for the element whose next pointer is us. */
 	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
 		prevl = (uintptr_t)atomic_load_ptr(&p->next);
 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
 			PV_STAT(counter_u64_add(invl_finish_restart, 1));
 			lock_delay(&lda);
 			goto again;
 		}
 		if ((void *)prevl == invl_gen)
 			break;
 	}
 
 	/*
 	 * It is legitimate to not find ourself on the list if a
 	 * thread before us finished its DI and started it again.
 	 */
 	if (__predict_false(p == NULL)) {
 		PV_STAT(counter_u64_add(invl_finish_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	/*
 	 * While our next pointer carries the INVALID marker, other list
 	 * walkers restart when they reach us, so the marked window is
 	 * kept inside a critical section.
 	 */
 	critical_enter();
 	atomic_set_ptr((uintptr_t *)&invl_gen->next,
 	    PMAP_INVL_GEN_NEXT_INVALID);
 	if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
 		/* Unlink failed: become walkable again before retrying. */
 		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
 		    PMAP_INVL_GEN_NEXT_INVALID);
 		critical_exit();
 		PV_STAT(counter_u64_add(invl_finish_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 	critical_exit();
 	/* Wake blocked waiters; 0 leaves pmap_invl_gen untouched. */
 	if (atomic_load_int(&pmap_invl_waiters) > 0)
 		pmap_delayed_invl_finish_unblock(0);
 	/* Undo the priority change made by pmap_delayed_invl_start_u(). */
 	if (invl_gen->saved_pri != 0) {
 		thread_lock(td);
 		sched_prio(td, invl_gen->saved_pri);
 		thread_unlock(td);
 	}
 }
 
 #ifdef DDB
 /*
  * DDB "show di_queue": print the lockless DI list starting at
  * pmap_invl_gen_head.  Each line gives the element's generation, whether
  * its next pointer carries PMAP_INVL_GEN_NEXT_INVALID, and the owning
  * thread with its tid.  The list head belongs to no thread and is shown
  * with a NULL thread and tid -1.
  */
 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
 {
 	struct pmap_invl_gen *p, *pn;
 	struct thread *td;
 	uintptr_t nextl;
 	bool first;
 
 	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
 	    first = false) {
 		nextl = (uintptr_t)atomic_load_ptr(&p->next);
 		/* Strip the marker so the walk continues past marked elements. */
 		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
 		td = first ? NULL : __containerof(p, struct thread,
 		    td_md.md_invl_gen);
 		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
 		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
 		    td != NULL ? td->td_tid : -1);
 	}
 }
 #endif
 
 /*
  * Statistics for pmap_delayed_invl_wait(), compiled in only with
  * PV_STATS.  invl_wait is bumped by both the locked and the lockless
  * wait paths; invl_wait_slow is bumped when the lockless path is about
  * to block on the turnstile.
  */
 #ifdef PV_STATS
 static COUNTER_U64_DEFINE_EARLY(invl_wait);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait,
     CTLFLAG_RD, &invl_wait,
     "Number of times DI invalidation blocked pmap_remove_all/write");
 
 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD,
      &invl_wait_slow, "Number of slow invalidation waits for lockless DI");
 
 #endif
 
 /*
  * Return a pointer to the DI generation counter that covers page m.
  *
  * With NUMA the counter lives in the pv_table slot of the page's
  * physical address; addresses above pmap_last_pa all share the counter
  * in pv_dummy_large.  Without NUMA the counter is chosen from
  * pv_invl_gen[] by pa_index() modulo NPV_LIST_LOCKS.
  */
 #ifdef NUMA
 static u_long *
 pmap_delayed_invl_genp(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	if (__predict_false((pa) > pmap_last_pa))
 		return (&pv_dummy_large.pv_invl_gen);
 	return (&(pa_to_pmdp(pa)->pv_invl_gen));
 }
 #else
 static u_long *
 pmap_delayed_invl_genp(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	return (&pv_invl_gen[pa_index(pa) % NPV_LIST_LOCKS]);
 }
 #endif
 
 /*
  * Callout handler armed by pmap_delayed_invl_wait_u(): if any thread is
  * still registered in pmap_invl_waiters, wake the blocked waiters
  * without changing the global generation.
  */
 static void
 pmap_delayed_invl_callout_func(void *arg __unused)
 {
 
 	if (atomic_load_int(&pmap_invl_waiters) != 0)
 		pmap_delayed_invl_finish_unblock(0);
 }
 
 /*
  * SYSINIT hook: set up pmap_invl_callout and record that it is usable.
  * Nothing is done when the locked DI implementation is in use.
  */
 static void
 pmap_delayed_invl_callout_init(void *arg __unused)
 {
 
 	if (!pmap_di_locked()) {
 		callout_init(&pmap_invl_callout, 1);
 		pmap_invl_callout_inited = true;
 	}
 }
 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
     pmap_delayed_invl_callout_init, NULL);
 
 /*
  * Wait until every DI block that still has to flush the TLB for page m
  * has done so.  When m has an empty PV list, returning from
  * pmap_delayed_invl_wait() therefore guarantees that no CPU holds a
  * valid mapping of m in either its page table or its TLB.
  *
  * The wait lasts until the global DI generation has caught up with the
  * generation recorded for m and its PV list.  Callers typically hold an
  * object lock and sometimes a page lock, so sleeping is not an option;
  * the thread blocks on a turnstile instead to give up the processor.
  *
  * This is the locked variant, which compares against pmap_invl_gen.
  */
 static void
 pmap_delayed_invl_wait_l(vm_page_t m)
 {
 	u_long *page_gen;
 #ifdef PV_STATS
 	bool counted = false;
 #endif
 
 	page_gen = pmap_delayed_invl_genp(m);
 	for (;;) {
 		if (*page_gen <= pmap_invl_gen)
 			break;
 #ifdef PV_STATS
 		/* Count this wait once, however many times we block. */
 		if (!counted) {
 			counter_u64_add(invl_wait, 1);
 			counted = true;
 		}
 #endif
 		pmap_delayed_invl_wait_block(page_gen, &pmap_invl_gen);
 	}
 }
 
 /*
  * Lockless variant of pmap_delayed_invl_wait(): wait until the global
  * generation in pmap_invl_gen_head.gen has caught up with the
  * generation recorded for page m.
  *
  * The first iteration, and every iteration while the callout is not yet
  * initialised, only backs off with lock_delay().  Later iterations
  * register in pmap_invl_waiters, arm a one-tick callout and block on
  * the turnstile.
  */
 static void
 pmap_delayed_invl_wait_u(vm_page_t m)
 {
 	u_long *m_gen;
 	struct lock_delay_arg lda;
 	bool fast;
 
 	fast = true;
 	m_gen = pmap_delayed_invl_genp(m);
 	lock_delay_arg_init(&lda, &di_delay);
 	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
 		if (fast || !pmap_invl_callout_inited) {
 			PV_STAT(counter_u64_add(invl_wait, 1));
 			lock_delay(&lda);
 			fast = false;
 		} else {
 			/*
 			 * The page's invalidation generation number
 			 * is still ahead of the global generation in
 			 * pmap_invl_gen_head.  Prepare to block so
 			 * that we do not waste CPU cycles or worse,
 			 * suffer livelock.
 			 *
 			 * Since it is impossible to block without
 			 * racing with pmap_delayed_invl_finish_u(),
 			 * prepare for the race by incrementing
 			 * pmap_invl_waiters and arming a 1-tick
 			 * callout which will unblock us if we lose
 			 * the race.
 			 */
 			atomic_add_int(&pmap_invl_waiters, 1);
 
 			/*
 			 * Re-check the page's generation against the
 			 * global generation after incrementing
 			 * pmap_invl_waiters, so that there is no race
 			 * with pmap_delayed_invl_finish_u() advancing
 			 * the global generation and checking
 			 * pmap_invl_waiters.  The only race allowed
 			 * is for a missed unblock, which is handled
 			 * by the callout.
 			 */
 			if (*m_gen >
 			    atomic_load_long(&pmap_invl_gen_head.gen)) {
 				callout_reset(&pmap_invl_callout, 1,
 				    pmap_delayed_invl_callout_func, NULL);
 				PV_STAT(counter_u64_add(invl_wait_slow, 1));
 				pmap_delayed_invl_wait_block(m_gen,
 				    &pmap_invl_gen_head.gen);
 			}
 			atomic_add_int(&pmap_invl_waiters, -1);
 		}
 	}
 }
 
 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
 {
 
 	/* Resolve to the _l variant when pmap_di_locked(), else to _u. */
 	if (pmap_di_locked())
 		return (pmap_thread_init_invl_gen_l);
 	return (pmap_thread_init_invl_gen_u);
 }
 
 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
 {
 
 	/* Resolve to the _l variant when pmap_di_locked(), else to _u. */
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_start_l);
 	return (pmap_delayed_invl_start_u);
 }
 
 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
 {
 
 	/* Resolve to the _l variant when pmap_di_locked(), else to _u. */
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_finish_l);
 	return (pmap_delayed_invl_finish_u);
 }
 
 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
 {
 
 	/* Resolve to the _l variant when pmap_di_locked(), else to _u. */
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_wait_l);
 	return (pmap_delayed_invl_wait_u);
 }
 
 /*
  * Mark the page m's PV list as participating in the current thread's
  * DI block.  Any threads concurrently using m's PV list to remove or
  * restrict all mappings to m will wait for the current thread's DI
  * block to complete before proceeding.
  *
  * The function works by setting the DI generation number for m's PV
  * list to at least the DI generation number of the current thread.
  * This forces a caller of pmap_delayed_invl_wait() to block until
  * current thread calls pmap_delayed_invl_finish().
  */
 static void
 pmap_delayed_invl_page(vm_page_t m)
 {
 	u_long *page_gen, thread_gen;
 
 	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
 
 	/* A zero thread generation leaves the page's generation untouched. */
 	thread_gen = curthread->td_md.md_invl_gen.gen;
 	if (thread_gen != 0) {
 		/* Only ever raise the page's generation, never lower it. */
 		page_gen = pmap_delayed_invl_genp(m);
 		if (thread_gen > *page_gen)
 			*page_gen = thread_gen;
 	}
 }
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 /*
  * Internal flags for pmap_mapdev_internal() and
  * pmap_change_props_locked().
  */
 #define	MAPDEV_FLUSHCACHE	0x00000001	/* Flush cache after mapping. */
 #define	MAPDEV_SETATTR		0x00000002	/* Modify existing attrs. */
 #define	MAPDEV_ASSERTVALID	0x00000004	/* Assert mapping validity. */
 
 TAILQ_HEAD(pv_chunklist, pv_chunk);
 
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static int	popcnt_pc_map_pq(uint64_t *map);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	reserve_pv_entries(pmap_t pmap, int needed,
 		    struct rwlock **lockp);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 		    struct rwlock **lockp);
 static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
 		    u_int flags, struct rwlock **lockp);
 #if VM_NRESERVLEVEL > 0
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 		    struct rwlock **lockp);
 #endif
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 
 static void	pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
     vm_prot_t prot, int mode, int flags);
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
     vm_offset_t va, struct rwlock **lockp);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
     vm_offset_t va);
 static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 		    vm_prot_t prot, struct rwlock **lockp);
 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
 		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
     vm_offset_t eva);
 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
     vm_offset_t eva);
 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
 		    pd_entry_t pde);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static vm_page_t pmap_large_map_getptp_unlocked(void);
 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
 #if VM_NRESERVLEVEL > 0
 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp);
 #endif
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
     bool exec);
 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     struct spglist *free);
 static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    pd_entry_t *pde, struct spglist *free,
 		    struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 
 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
 		struct rwlock **lockp);
 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp, vm_offset_t va);
 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp, vm_offset_t va);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
 		struct rwlock **lockp);
 
 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int);
 static void pmap_free_pt_page(pmap_t, vm_page_t, bool);
 
 /********************/
 /* Inline functions */
 /********************/
 
 /*
  * Return the non-clipped index for a given VA, i.e. the index of the
  * page table page at the corresponding level.
  */
 static __inline vm_pindex_t
 pmap_pde_pindex(vm_offset_t va)
 {
 	vm_pindex_t pindex;
 
 	/* Drop the low PDRSHIFT bits of the address. */
 	pindex = va >> PDRSHIFT;
 	return (pindex);
 }
 
 static __inline vm_pindex_t
 pmap_pdpe_pindex(vm_offset_t va)
 {
 	vm_pindex_t pindex;
 
 	/* These indexes are numbered after the NUPDE indexes below them. */
 	pindex = va >> PDPSHIFT;
 	return (NUPDE + pindex);
 }
 
 static __inline vm_pindex_t
 pmap_pml4e_pindex(vm_offset_t va)
 {
 	vm_pindex_t pindex;
 
 	/* Offset past the NUPDE and NUPDPE index ranges. */
 	pindex = va >> PML4SHIFT;
 	return (NUPDE + NUPDPE + pindex);
 }
 
 static __inline vm_pindex_t
 pmap_pml5e_pindex(vm_offset_t va)
 {
 	vm_pindex_t pindex;
 
 	/* Offset past the NUPDE, NUPDPE and NUPML4E index ranges. */
 	pindex = va >> PML5SHIFT;
 	return (NUPDE + NUPDPE + NUPML4E + pindex);
 }
 
 static __inline pml4_entry_t *
 pmap_pml5e(pmap_t pmap, vm_offset_t va)
 {
 
 	/* Only meaningful for a pmap using 5-level paging. */
 	MPASS(pmap_is_la57(pmap));
 	return (pmap->pm_pmltop + pmap_pml5e_index(va));
 }
 
 static __inline pml4_entry_t *
 pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
 {
 
 	/* Same lookup as pmap_pml5e(), but in the pm_pmltopu table. */
 	MPASS(pmap_is_la57(pmap));
 	return (pmap->pm_pmltopu + pmap_pml5e_index(va));
 }
 
 static __inline pml4_entry_t *
 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
 {
 	vm_paddr_t pml4_pa;
 
 	/* XXX MPASS(pmap_is_la57(pmap)); -- no pmap is available here. */
 	pml4_pa = *pml5e & PG_FRAME;
 	return ((pml4_entry_t *)PHYS_TO_DMAP(pml4_pa) + pmap_pml4e_index(va));
 }
 
 /* Return a pointer to the PML4 slot that corresponds to a VA */
 static __inline pml4_entry_t *
 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 {
 	pml5_entry_t *l5;
 	pt_entry_t PG_V;
 
 	/* Without LA57 the top-level table is the PML4 itself. */
 	if (!pmap_is_la57(pmap))
 		return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
 
 	/* With LA57, go through the PML5 entry, which must be valid. */
 	l5 = pmap_pml5e(pmap, va);
 	PG_V = pmap_valid_bit(pmap);
 	if ((*l5 & PG_V) == 0)
 		return (NULL);
 	return (pmap_pml5e_to_pml4e(l5, va));
 }
 
 static __inline pml4_entry_t *
 pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
 {
 
 	/* Valid only when pm_pmltopu is a PML4, i.e. not under LA57. */
 	MPASS(!pmap_is_la57(pmap));
 	return (pmap->pm_pmltopu + pmap_pml4e_index(va));
 }
 
 /* Return a pointer to the PDP slot that corresponds to a VA */
 static __inline pdp_entry_t *
 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 {
 	vm_paddr_t pdp_pa;
 
 	/* Reach the PDP page named by the entry through the direct map. */
 	pdp_pa = *pml4e & PG_FRAME;
 	return ((pdp_entry_t *)PHYS_TO_DMAP(pdp_pa) + pmap_pdpe_index(va));
 }
 
 /* Return a pointer to the PDP slot that corresponds to a VA */
 static __inline pdp_entry_t *
 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 {
 	pml4_entry_t *l4;
 	pt_entry_t PG_V;
 
 	PG_V = pmap_valid_bit(pmap);
 	l4 = pmap_pml4e(pmap, va);
 
 	/* Descend only through an existing, valid PML4 entry. */
 	if (l4 != NULL && (*l4 & PG_V) != 0)
 		return (pmap_pml4e_to_pdpe(l4, va));
 	return (NULL);
 }
 
 /* Return a pointer to the PD slot that corresponds to a VA */
 static __inline pd_entry_t *
 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 {
 	vm_paddr_t pd_pa;
 
 	/* A PDP entry with PG_PS set is a leaf and has no PD page below it. */
 	KASSERT((*pdpe & PG_PS) == 0,
 	    ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
 	pd_pa = *pdpe & PG_FRAME;
 	return ((pd_entry_t *)PHYS_TO_DMAP(pd_pa) + pmap_pde_index(va));
 }
 
 /* Return a pointer to the PD slot that corresponds to a VA */
 static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va)
 {
 	pdp_entry_t *l3;
 	pt_entry_t PG_V;
 
 	PG_V = pmap_valid_bit(pmap);
 	l3 = pmap_pdpe(pmap, va);
 
 	/* Descend only through an existing, valid PDP entry. */
 	if (l3 != NULL && (*l3 & PG_V) != 0) {
 		KASSERT((*l3 & PG_PS) == 0,
 		    ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
 		return (pmap_pdpe_to_pde(l3, va));
 	}
 	return (NULL);
 }
 
 /* Return a pointer to the PT slot that corresponds to a VA */
 static __inline pt_entry_t *
 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 {
 	vm_paddr_t pt_pa;
 
 	/* A PD entry with PG_PS set is a leaf and has no PT page below it. */
 	KASSERT((*pde & PG_PS) == 0,
 	    ("%s: pde %#lx is a leaf", __func__, *pde));
 	pt_pa = *pde & PG_FRAME;
 	return ((pt_entry_t *)PHYS_TO_DMAP(pt_pa) + pmap_pte_index(va));
 }
 
 /* Return a pointer to the PT slot that corresponds to a VA */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l2;
 	pt_entry_t PG_V;
 
 	PG_V = pmap_valid_bit(pmap);
 	l2 = pmap_pde(pmap, va);
 	if (l2 != NULL && (*l2 & PG_V) != 0) {
 		/*
 		 * For a PG_PS (leaf) PDE the PDE itself is returned,
 		 * for compat with i386 pmap_pte().
 		 */
 		return ((*l2 & PG_PS) != 0 ? (pt_entry_t *)l2 :
 		    pmap_pde_to_pte(l2, va));
 	}
 	return (NULL);
 }
 
 static __inline void
 pmap_resident_count_adj(pmap_t pmap, int count)
 {
 
 	/* The counter is only modified with the pmap lock held. */
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A (negative) adjustment must not take the count below zero. */
 	KASSERT(pmap->pm_stats.resident_count + count >= 0,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count =
 	    pmap->pm_stats.resident_count + count;
 }
 
 static __inline void
 pmap_pt_page_count_adj(pmap_t pmap, int count)
 {
 
 	/* Kernel page table pages only feed the kernel counter. */
 	if (pmap == kernel_pmap) {
 		counter_u64_add(kernel_pt_page_count, count);
 		return;
 	}
 
 	/*
 	 * Everything else goes to the user counter; a NULL pmap is
 	 * tolerated and simply skips the per-pmap resident count.
 	 */
 	if (pmap != NULL)
 		pmap_resident_count_adj(pmap, count);
 	counter_u64_add(user_pt_page_count, count);
 }
 
 PMAP_INLINE pt_entry_t *
 vtopte(vm_offset_t va)
 {
 	pt_entry_t *base;
 	u_int64_t idx_mask;
 	int idx_bits;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 
 	/*
 	 * The PTE index spans the index bits of every paging level in
 	 * use; LA57 adds the PML5 bits and uses its own recursive base.
 	 */
 	idx_bits = NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT;
 	if (la57) {
 		idx_bits += NPML5EPGSHIFT;
 		base = P5Tmap;
 	} else {
 		base = P4Tmap;
 	}
 	idx_mask = (1ul << idx_bits) - 1;
 	return (base + ((va >> PAGE_SHIFT) & idx_mask));
 }
 
 static __inline pd_entry_t *
 vtopde(vm_offset_t va)
 {
 	pd_entry_t *base;
 	u_int64_t idx_mask;
 	int idx_bits;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 
 	/*
 	 * Same scheme as vtopte(), one level up: the index covers the
 	 * PD and higher levels, and the VA is shifted by PDRSHIFT.
 	 */
 	idx_bits = NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT;
 	if (la57) {
 		idx_bits += NPML5EPGSHIFT;
 		base = P5Dmap;
 	} else {
 		base = P4Dmap;
 	}
 	idx_mask = (1ul << idx_bits) - 1;
 	return (base + ((va >> PDRSHIFT) & idx_mask));
 }
 
 static u_int64_t
 allocpages(vm_paddr_t *firstaddr, int n)
 {
 	u_int64_t base;
 
 	/*
 	 * Bump allocator: hand out the n pages starting at *firstaddr,
 	 * zeroed, and advance *firstaddr past them.  The address in
 	 * *firstaddr is dereferenced directly as a pointer by bzero().
 	 */
 	base = *firstaddr;
 	bzero((void *)base, n * PAGE_SIZE);
 	*firstaddr = *firstaddr + n * PAGE_SIZE;
 	return (base);
 }
 
 CTASSERT(powerof2(NDMPML4E));
 
 /* number of kernel PDP slots */
 #define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)
 
 static void
 nkpt_init(vm_paddr_t addr)
 {
 	int npages;
 
 #ifdef NKPT
 	/* A compile-time NKPT overrides the estimate entirely. */
 	npages = NKPT;
 #else
 	/*
 	 * Start from the bare minimum: one PT page per 2M between
 	 * kernphys and addr (+1 for 2M hole @0), plus the PD pages
 	 * needed to hold those.
 	 *
 	 * Then add some slop beyond the bare minimum required for
 	 * bootstrapping the kernel (64MB additional slop).
 	 *
 	 * This is quite important when allocating KVA for kernel modules.
 	 * The modules are required to be linked in the negative 2GB of
 	 * the address space.  If we run out of KVA in this region then
 	 * pmap_growkernel() will need to allocate page table pages to map
 	 * the entire 512GB of KVA space which is an unnecessary tax on
 	 * physical memory.
 	 *
 	 * Secondly, device memory mapped as part of setting up the low-
 	 * level console(s) is taken from KVA, starting at virtual_avail.
 	 * This is because cninit() is called after pmap_bootstrap() but
 	 * before vm_init() and pmap_init(). 20MB for a frame buffer is
 	 * not uncommon.
 	 */
 	npages = howmany(addr - kernphys, NBPDR) + 1;
 	npages += NKPDPE(npages) + 32;
 #endif
 	nkpt = npages;
 }
 
 /*
  * Returns the proper write/execute permission for a physical page that is
  * part of the initial boot allocations.
  *
  * If the page has kernel text, it is marked as read-only. If the page has
  * kernel read-only data, it is marked as read-only/not-executable. If the
  * page has only read-write data, it is marked as read-write/not-executable.
  * If the page is below/above the kernel range, it is marked as read-write.
  *
  * This function operates on 2M pages, since we map the kernel space that
  * way.
  */
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
 	vm_paddr_t kern_lo, kern_hi, rw_lo, text_hi;
 
 	/* Physical 2M-granular boundaries of the kernel image sections. */
 	kern_lo = trunc_2mpage(kernphys + btext - KERNSTART);
 	kern_hi = trunc_2mpage(kernphys + _end - KERNSTART);
 	rw_lo = trunc_2mpage(kernphys + brwsection - KERNSTART);
 	text_hi = round_2mpage(kernphys + etext - KERNSTART);
 
 	/*
 	 * Read-write and not executable in three cases:
 	 *
 	 * - Below the kernel: the kernel is loaded at a 2MB-aligned
 	 *   address, and memory below that need not be executable.
 	 * - Above the kernel: the .bss section is padded to a 2MB
 	 *   boundary, so memory following the kernel need not be
 	 *   executable either.  Preloaded kernel modules have their
 	 *   mapping permissions fixed up by the linker.
 	 * - From the read-write section onwards: the linker should ensure
 	 *   that the read-only and read-write portions don't share the
 	 *   same 2M page, so this shouldn't impact read-only data, but in
 	 *   any case a page with read-write data needs to be read-write.
 	 */
 	if (pa < kern_lo || pa >= kern_hi || pa >= rw_lo)
 		return (X86_PG_RW | pg_nx);
 
 	/*
 	 * Any 2M page containing kernel text is read-only (and
 	 * executable); the remaining read-only data pages are read-only
 	 * and not executable.  A small portion of the read-only data
 	 * section is likely to end up executable, which should be
 	 * acceptable since the read-only protection keeps the data from
 	 * changing.  Note that fixups to the .text section will still
 	 * work until we set CR0.WP.
 	 */
 	return (pa < text_hi ? 0 : pg_nx);
 }
 
 /*
  * Build the initial kernel page tables in pages carved out at *firstaddr
  * with allocpages(): the direct map, the kernel (KVA) PDP/PD/PT pages,
  * the optional KASAN/KMSAN tables, and the recursive PML4 slot.
  *
  * *firstaddr is advanced past everything allocated here (and at least to
  * round_2mpage(KERNend)), and kernel_pml4 is set to the direct-map address
  * of the new PML4 page.  The tables are filled in through their physical
  * addresses cast directly to pointers (e.g. (pd_entry_t *)KPDphys).
  * NOTE(review): this presumably relies on the boot-time mappings making
  * those addresses directly accessible -- confirm.
  */
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
 	/* Only assigned under AMDID_PAGE1GB; only read when ndm1g > 0. */
 	uint64_t DMPDkernphys;
 	vm_paddr_t pax;
 #ifdef KASAN
 	pt_entry_t *pt_p;
 	uint64_t KASANPDphys, KASANPTphys, KASANphys;
 	vm_offset_t kasankernbase;
 	int kasankpdpi, kasankpdi, nkasanpte;
 #endif
 	int i, j, ndm1g, nkpdpe, nkdmpde;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
 	if (ndmpdpphys > NDMPML4E) {
 		/*
 		 * Each NDMPML4E allows 512 GB, so limit to that,
 		 * and then readjust ndmpdp and ndmpdpphys.
 		 */
 		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
 		Maxmem = atop(NDMPML4E * NBPML4);
 		ndmpdpphys = NDMPML4E;
 		ndmpdp = NDMPML4E * NPDEPG;
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	/* Number of direct-map PDP entries that become 1GB leaf mappings. */
 	ndm1g = 0;
 	if ((amd_feature & AMDID_PAGE1GB) != 0) {
 		/*
 		 * Calculate the number of 1G pages that will fully fit in
 		 * Maxmem.
 		 */
 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
 
 		/*
 		 * Allocate 2M pages for the kernel. These will be used in
 		 * place of the one or more 1G pages from ndm1g that maps
 		 * kernel memory into DMAP.
 		 */
 		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
 		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
 	}
 	/* PDP entries not covered by 1GB pages each need a PD page. */
 	if (ndm1g < ndmpdp)
 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 
 	/* Allocate pages. */
 	KPML4phys = allocpages(firstaddr, 1);
 	KPDPphys = allocpages(firstaddr, NKPML4E);
 #ifdef KASAN
 	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
 	KASANPDphys = allocpages(firstaddr, 1);
 #endif
 #ifdef KMSAN
 	/*
 	 * The KMSAN shadow maps are initially left unpopulated, since there is
 	 * no need to shadow memory above KERNBASE.
 	 */
 	KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E);
 	KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E);
 #endif
 
 	/*
 	 * Allocate the initial number of kernel page table pages required to
 	 * bootstrap.  We defer this until after all memory-size dependent
 	 * allocations are done (e.g. direct map), so that we don't have to
 	 * build in too much slop in our estimate.
 	 *
 	 * Note that when NKPML4E > 1, we have an empty page underneath
 	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
 	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 	 */
 	nkpt_init(*firstaddr);
 	nkpdpe = NKPDPE(nkpt);
 
 	KPTphys = allocpages(firstaddr, nkpt);
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 #ifdef KASAN
 	nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE);
 	KASANPTphys = allocpages(firstaddr, nkasanpte);
 	KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG);
 #endif
 
 	/*
 	 * Connect the zero-filled PT pages to their PD entries.  This
 	 * implicitly maps the PT pages at their correct locations within
 	 * the PTmap.
 	 */
 	pd_p = (pd_entry_t *)KPDphys;
 	for (i = 0; i < nkpt; i++)
 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
 	 * Map from start of the kernel in physical memory (staging
 	 * area) to the end of loader preallocated memory using 2MB
 	 * pages.  This replaces some of the PD entries created above.
 	 * For compatibility, identity map 2M at the start.
 	 */
 	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
 	    X86_PG_RW | pg_nx;
 	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
 		    X86_PG_A | bootaddr_rwx(pax);
 	}
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
 	 * to record the physical blocks we've actually mapped into kernel
 	 * virtual address space.
 	 */
 	if (*firstaddr < round_2mpage(KERNend))
 		*firstaddr = round_2mpage(KERNend);
 
 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
 	for (i = 0; i < nkpdpe; i++)
 		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 #ifdef KASAN
 	/*
 	 * Populate the shadow map starting at the shadow address of
 	 * KERNBASE: one PDP entry, nkasanpte PD entries, and a PTE for
 	 * every page of the KASANphys backing memory.
 	 */
 	kasankernbase = kasan_md_addr_to_shad(KERNBASE);
 	kasankpdpi = pmap_pdpe_index(kasankernbase);
 	kasankpdi = pmap_pde_index(kasankernbase);
 
 	pdp_p = (pdp_entry_t *)KASANPDPphys;
 	pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx);
 
 	pd_p = (pd_entry_t *)KASANPDphys;
 	for (i = 0; i < nkasanpte; i++)
 		pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW |
 		    X86_PG_V | pg_nx;
 
 	pt_p = (pt_entry_t *)KASANPTphys;
 	for (i = 0; i < nkasanpte * NPTEPG; i++)
 		pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
 		    X86_PG_M | X86_PG_A | pg_nx;
 #endif
 
 	/*
 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
 	 * the end of physical memory is not aligned to a 1GB page boundary,
 	 * then the residual physical memory is mapped with 2MB pages.  Later,
 	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
 	 * that are partially used.
 	 */
 	pd_p = (pd_entry_t *)DMPDphys;
 	/* i is the global 2MB frame number, j the slot within DMPDphys. */
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	/* i carries on from ndm1g: the rest point at the 2MB PD pages. */
 	for (j = 0; i < ndmpdp; i++, j++) {
 		pdp_p[i] = DMPDphys + ptoa(j);
 		pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 
 	/*
 	 * Instead of using a 1G page for the memory containing the kernel,
 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
 	 * pages, this will partially overwrite the PDPEs above.)
 	 */
 	if (ndm1g > 0) {
 		pd_p = (pd_entry_t *)DMPDkernphys;
 		for (i = 0, pax = rounddown2(kernphys, NBPDP);
 		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
 			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
 			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
 		}
 		/* j is the PDP slot of the 1GB region holding kernphys. */
 		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
 		for (i = 0; i < nkdmpde; i++) {
 			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
 			    X86_PG_RW | X86_PG_V | pg_nx;
 		}
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
 	p4_p = (pml4_entry_t *)KPML4phys;
 	p4_p[PML4PML4I] = KPML4phys;
 	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
 
 #ifdef KASAN
 	/* Connect the KASAN shadow map slots up to the PML4. */
 	for (i = 0; i < NKASANPML4E; i++) {
 		p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i);
 		p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 #endif
 
 #ifdef KMSAN
 	/* Connect the KMSAN shadow map slots up to the PML4. */
 	for (i = 0; i < NKMSANSHADPML4E; i++) {
 		p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i);
 		p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 
 	/* Connect the KMSAN origin map slots up to the PML4. */
 	for (i = 0; i < NKMSANORIGPML4E; i++) {
 		p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i);
 		p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 #endif
 
 	/* Connect the Direct Map slots up to the PML4. */
 	for (i = 0; i < ndmpdpphys; i++) {
 		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
 		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 
 	/* Connect the KVA slots up to the PML4 */
 	for (i = 0; i < NKPML4E; i++) {
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 	}
 
 	/* From here on the PML4 is referred to by its direct-map address. */
 	kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On amd64 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 void
 pmap_bootstrap(vm_paddr_t *firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *pcpu_pte;
 	struct region_descriptor r_gdt;
 	uint64_t cr4, pcpu_phys;
 	u_long res;
 	int i;
 
 	/*
 	 * Record the end of the loaded kernel before any allocation moves
 	 * *firstaddr.  res, the number of pages from kernphys to KERNend,
 	 * becomes the kernel pmap's initial resident count below.
 	 */
 	KERNend = *firstaddr;
 	res = atop(KERNend - (vm_paddr_t)kernphys);
 
 	/* pg_g must be set before create_pagetables() ORs it into entries. */
 	if (!pti)
 		pg_g = X86_PG_G;
 
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
 	create_pagetables(firstaddr);
 
 	/* One page per possible CPU; mapped at __pcpu further down. */
 	pcpu_phys = allocpages(firstaddr, MAXCPU);
 
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
 	 * preallocated kernel page table pages so that vm_page structures
 	 * representing these pages will be created.  The vm_page structures
 	 * are required for promotion of the corresponding kernel virtual
 	 * addresses to superpage mappings.
 	 */
 	vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 
 	/*
 	 * Account for the virtual addresses mapped by create_pagetables().
 	 */
 	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
 	    (vm_paddr_t)kernphys);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Enable PG_G global pages, then switch to the kernel page
 	 * table from the bootstrap page table.  After the switch, it
 	 * is possible to enable SMEP and SMAP since PG_U bits are
 	 * correct now.
 	 */
 	cr4 = rcr4();
 	cr4 |= CR4_PGE;
 	load_cr4(cr4);
 	load_cr3(KPML4phys);
 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 		cr4 |= CR4_SMEP;
 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 		cr4 |= CR4_SMAP;
 	load_cr4(cr4);
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * Count bootstrap data as being resident in case any of this data is
 	 * later unmapped (using pmap_remove()) and freed.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pmltop = kernel_pml4;
 	kernel_pmap->pm_cr3 = KPML4phys;
 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	kernel_pmap->pm_stats.resident_count = res;
 	kernel_pmap->pm_flags = pmap_flags;
 
  	/*
 	 * Initialize the TLB invalidations generation number lock.
 	 */
 	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * SYSMAP(c, p, v, n) reserves n pages: v receives the current VA
 	 * (cast to type c), p the address of the first PTE for it, and the
 	 * va/pte cursors are advanced by n pages/entries.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 	/*
 	 * Crashdump maps.  The first page is reused as CMAP1 for the
 	 * memory test.
 	 */
 	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
 	CADDR1 = crashdumpmap;
 
 	SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
 	virtual_avail = va;
 
 	/* Back each reserved __pcpu page with its page from pcpu_phys. */
 	for (i = 0; i < MAXCPU; i++) {
 		pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
 		    pg_g | pg_nx | X86_PG_M | X86_PG_A;
 	}
 
 	/*
 	 * Re-initialize PCPU area for BSP after switching.
 	 * Make hardware use gdt and common_tss from the new PCPU.
 	 */
 	STAILQ_INIT(&cpuhead);
 	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
 	pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
 	amd64_bsp_pcpu_init1(&__pcpu[0]);
 	amd64_bsp_ist_init(&__pcpu[0]);
 	__pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
 	    sizeof(struct user_segment_descriptor));
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
 	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
 	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
 	lgdt(&r_gdt);
 	/*
 	 * NOTE(review): GSBASE is written a second time right after
 	 * lgdt(); presumably lgdt() can disturb it -- confirm.
 	 */
 	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
 	/* Carry over fields already set up in the temporary BSP pcpu. */
 	__pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
 	__pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
 
 	/*
 	 * Initialize the PAT MSR.
 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
 	 * side-effect, invalidates stale PG_G TLB entries that might
 	 * have been created in our pre-boot environment.
 	 */
 	pmap_init_pat();
 
 	/* Initialize TLB Context Id. */
 	if (pmap_pcid_enabled) {
 		for (i = 0; i < MAXCPU; i++) {
 			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
 			kernel_pmap->pm_pcids[i].pm_gen = 1;
 		}
 
 		/*
 		 * PMAP_PCID_KERN + 1 is used for initialization of
 		 * proc0 pmap.  The pmap's pcid state might be used by
 		 * EFIRT entry before first context switch, so it
 		 * needs to be valid.
 		 */
 		PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
 		PCPU_SET(pcid_gen, 1);
 
 		/*
 		 * pcpu area for APs is zeroed during AP startup.
 		 * pc_pcid_next and pc_pcid_gen are initialized by AP
 		 * during pcpu setup.
 		 */
 		load_cr4(rcr4() | CR4_PCIDE);
 	}
 }
 
 /*
  * Setup the PAT MSR.
  */
 void
 pmap_init_pat(void)
 {
 	uint64_t pat_msr;
 	u_long cr0, cr4;
 	int i;
 
 	/* PAT is mandatory here: panic if this CPU does not advertise it. */
 	if ((cpu_feature & CPUID_PAT) == 0)
 		panic("no PAT??");
 
 	/*
 	 * Set default PAT index table.  Every slot starts out as -1 and
 	 * only the modes listed below receive a PAT entry number.
 	 * NOTE(review): -1 presumably marks a mode with no PAT entry --
 	 * confirm against the users of pat_index[].
 	 */
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_index[i] = -1;
 	pat_index[PAT_WRITE_BACK] = 0;
 	pat_index[PAT_WRITE_THROUGH] = 1;
 	pat_index[PAT_UNCACHEABLE] = 3;
 	pat_index[PAT_WRITE_COMBINING] = 6;
 	pat_index[PAT_WRITE_PROTECTED] = 5;
 	pat_index[PAT_UNCACHED] = 2;
 
 	/*
 	 * Initialize default PAT entries.
 	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 	 * Program 5 and 6 as WP and WC.
 	 *
 	 * Leave 4 and 7 as WB and UC.  Note that a recursive page table
 	 * mapping for a 2M page uses a PAT value with the bit 3 set due
 	 * to its overload with PG_PS.
 	 */
 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(2, PAT_UNCACHED) |
 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
 	    PAT_VALUE(4, PAT_WRITE_BACK) |
 	    PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 	    PAT_VALUE(6, PAT_WRITE_COMBINING) |
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	/*
 	 * The MSR write is bracketed: PGE off, caches disabled and
 	 * flushed, TLB flushed, MSR written, everything flushed again,
 	 * then the saved CR0 and CR4 values restored.
 	 */
 
 	/* Disable PGE. */
 	cr4 = rcr4();
 	load_cr4(cr4 & ~CR4_PGE);
 
 	/* Disable caches (CD = 1, NW = 0). */
 	cr0 = rcr0();
 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 
 	/* Flushes caches and TLBs. */
 	wbinvd();
 	invltlb();
 
 	/* Update PAT and index table. */
 	wrmsr(MSR_PAT, pat_msr);
 
 	/* Flush caches and TLBs again. */
 	wbinvd();
 	invltlb();
 
 	/* Restore caches and PGE. */
 	load_cr0(cr0);
 	load_cr4(cr4);
 }
 
 vm_page_t
 pmap_page_alloc_below_4g(bool zeroed)
 {
-	vm_page_t m;
-
-	m = vm_page_alloc_contig(NULL, 0, (zeroed ? VM_ALLOC_ZERO : 0) |
-	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ,
-	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
-	if (m != NULL && zeroed && (m->flags & PG_ZERO) == 0)
-		pmap_zero_page(m);
-	return (m);
+	return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0),
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
 }
 
 extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
     la57_trampoline_gdt[], la57_trampoline_end[];
 
 /*
  * Switch the kernel page tables from the 48-bit layout to the 57-bit
  * (LA57) layout during boot.
  *
  * Nothing is done unless the CPU reports CPUID_STDEXT2_LA57 and la57 is
  * non-zero after fetching the "vm.pmap.la57" tunable.  Otherwise the
  * function:
  *  - allocates the trampoline page and the page table pages below 4G,
  *  - builds a temporary 1:1 mapping of the trampoline page, both in the
  *    current top-level table and under a new PML5 table,
  *  - hooks the existing kernel PML4 (KPML4phys) into the new PML5,
  *  - copies the trampoline, patches its GDT descriptor and calls it,
  *  - reloads the GDT, GS base, data segments and the task register,
  *  - removes the temporary PML5 entry, frees the scratch pages, and
  *    makes kernel_pmap use the PML5 as its top-level table.
  *
  * Registered to run via SYSINIT at SI_SUB_KMEM.
  *
  * NOTE(review): the pages returned by pmap_page_alloc_below_4g() are
  * used without a NULL check; confirm allocation cannot fail here.
  */
 static void
 pmap_bootstrap_la57(void *arg __unused)
 {
 	char *v_code;
 	pml5_entry_t *v_pml5;
 	pml4_entry_t *v_pml4;
 	pdp_entry_t *v_pdp;
 	pd_entry_t *v_pd;
 	pt_entry_t *v_pt;
 	vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
 	void (*la57_tramp)(uint64_t pml5);
 	struct region_descriptor r_gdt;
 
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
 		return;
 	TUNABLE_INT_FETCH("vm.pmap.la57", &la57);
 	if (!la57)
 		return;
 
 	/* Describe CPU 0's GDT now; it is reloaded after the trampoline. */
 	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
 	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
 
 	/*
 	 * Allocate zeroed pages below 4G and compute their direct map
 	 * addresses.
 	 *
 	 * NOTE(review): v_pml4 and v_pd are assigned through a
 	 * (pdp_entry_t *) cast although they are declared as
 	 * pml4_entry_t * and pd_entry_t *; presumably harmless if the
 	 * entry types share one underlying type -- confirm.
 	 */
 	m_code = pmap_page_alloc_below_4g(true);
 	v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
 	m_pml5 = pmap_page_alloc_below_4g(true);
 	KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
 	v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
 	m_pml4 = pmap_page_alloc_below_4g(true);
 	v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
 	m_pdp = pmap_page_alloc_below_4g(true);
 	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
 	m_pd = pmap_page_alloc_below_4g(true);
 	v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
 	m_pt = pmap_page_alloc_below_4g(true);
 	v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
 
 	/*
 	 * Map m_code 1:1, it appears below 4G in KVA due to physical
 	 * address being below 4G.  Since kernel KVA is in upper half,
 	 * the pml4e should be zero and free for temporary use.
 	 *
 	 * NOTE(review): this temporary entry in the current top-level
 	 * table is not cleared anywhere in this function -- confirm that
 	 * is intended once m_pdp has been freed below.
 	 */
 	kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 	v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 	v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 	v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 
 	/*
 	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
 	 * entering all existing kernel mappings into level 5 table.
 	 */
 	v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
 
 	/*
 	 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
 	 * The new pml4 page reuses the pdp/pd/pt chain built above.
 	 */
 	v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 	v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
 	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
 	    X86_PG_M;
 
 	/*
 	 * Copy and call the 48->57 trampoline, hope we return there, alive.
 	 * The word at offset 2 of the copied GDT descriptor is patched with
 	 * the physical address of the copied trampoline GDT.  The
 	 * trampoline is entered through its 1:1 (physical) address and is
 	 * passed the physical address of the new PML5 table.
 	 */
 	bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
 	*(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
 	    la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
 	la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
 	invlpg((vm_offset_t)la57_tramp);
 	la57_tramp(KPML5phys);
 
 	/*
 	 * The gdt was necessarily reset, switch back to our gdt.
 	 */
 	lgdt(&r_gdt);
 	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
 
 	/*
 	 * Now unmap the trampoline, and free the pages.
 	 * Clear pml5 entry used for 1:1 trampoline mapping.
 	 *
 	 * NOTE(review): m_pml4 is not freed here although the pml5 entry
 	 * that referenced it is cleared -- confirm whether that is
 	 * intentional.
 	 */
 	pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
 	invlpg((vm_offset_t)v_code);
 	vm_page_free(m_code);
 	vm_page_free(m_pdp);
 	vm_page_free(m_pd);
 	vm_page_free(m_pt);
 
 	/*
 	 * Recursively map PML5 to itself in order to get PTmap and
 	 * PDmap.
 	 */
 	v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
 
 	/* From here on the kernel pmap is rooted at the PML5 table. */
 	kernel_pmap->pm_cr3 = KPML5phys;
 	kernel_pmap->pm_pmltop = v_pml5;
 	pmap_pt_page_count_adj(kernel_pmap, 1);
 }
 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
 
 /*
  *	Set up the machine-dependent part of a vm_page: the caching mode
  *	starts out as write-back and the pv list starts out empty.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 	m->md.pat_mode = PAT_WRITE_BACK;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  * Knob controlling executable 2M mappings in EPT pmaps, exported as the
  * vm.pmap.allow_2m_x_ept sysctl.
  *
  * NOTE(review): as the code in this file reads, a non-zero value makes
  * pmap_allow_2m_x_page() return false for executable EPT mappings, and
  * pmap_allow_2m_x_ept_recalculate() sets it to 1 for Intel CPUs that
  * neither report IA32_ARCH_CAP_IF_PSCHANGE_MC_NO nor appear in its model
  * list.  That is the opposite of what the name and the sysctl
  * description suggest -- confirm the intended polarity before relying
  * on either.
  */
 static int pmap_allow_2m_x_ept;
 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
     &pmap_allow_2m_x_ept, 0,
     "Allow executable superpage mappings in EPT");
 
 void
 pmap_allow_2m_x_ept_recalculate(void)
 {
 	/*
 	 * SKL002, SKL012S.  Since the EPT format is only used by
 	 * Intel CPUs, the vendor check is merely a formality.
 	 */
 	if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
 	    (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
 	    (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x26 ||	/* Atoms */
 	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x37 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x86 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5f ||
 	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x7a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x57 ||	/* Knights */
 	    CPUID_TO_MODEL(cpu_id) == 0x85))))
 		pmap_allow_2m_x_ept = 1;
 	TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
 }
 
 /*
  * Decide whether a 2M mapping may be created in "pmap".  Only executable
  * mappings in EPT pmaps are restricted; for those the answer is the
  * negation of pmap_allow_2m_x_ept.
  */
 static bool
 pmap_allow_2m_x_page(pmap_t pmap, bool executable)
 {
 	if (pmap->pm_type != PT_EPT)
 		return (true);
 	if (!executable)
 		return (true);
 	return (!pmap_allow_2m_x_ept);
 }
 
 #ifdef NUMA
 /*
  * NUMA variant: build pv_table, an array of struct pmap_large_md_page
  * indexed by NBPDR-sized chunk of physical address space, covering
  * everything up to the end of the last physical segment.
  *
  * KVA for the whole array is reserved with kva_alloc().  Backing pages
  * are then allocated segment by segment from the segment's own domain
  * and entered with pmap_qenter(), after which every entry in the newly
  * backed range is initialized.  pv_dummy_large is initialized the same
  * way at the end.
  */
 static void
 pmap_init_pv_table(void)
 {
 	struct pmap_large_md_page *pvd;
 	vm_size_t s;
 	long start, end, highest, pv_npg;
 	int domain, i, j, pages;
 
 	/*
 	 * We strongly depend on the size being a power of two, so the assert
 	 * is overzealous. However, should the struct be resized to a
 	 * different power of two, the code below needs to be revisited.
 	 */
 	CTASSERT((sizeof(*pvd) == 64));
 
 	/*
 	 * Calculate the size of the array.
 	 */
 	pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end;
 	pv_npg = howmany(pmap_last_pa, NBPDR);
 	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
 	s = round_page(s);
 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
 	if (pv_table == NULL)
 		panic("%s: kva_alloc failed\n", __func__);
 
 	/*
 	 * Iterate physical segments to allocate space for respective pages.
 	 * "highest" is the last pv_table index that already has backing
 	 * memory; a segment ending at or below it needs nothing new.
 	 */
 	highest = -1;
 	s = 0;
 	for (i = 0; i < vm_phys_nsegs; i++) {
 		end = vm_phys_segs[i].end / NBPDR;
 		domain = vm_phys_segs[i].domain;
 
 		if (highest >= end)
 			continue;
 
 		start = highest + 1;
 		pvd = &pv_table[start];
 
 		/*
 		 * Back entries start..end, rounded up to whole pages;
 		 * "highest" advances to the last entry that fits in the
 		 * rounded-up size.
 		 */
 		pages = end - start + 1;
 		s = round_page(pages * sizeof(*pvd));
 		highest = start + (s / sizeof(*pvd)) - 1;
 
 		/* Allocate and map one page at a time from this domain. */
 		for (j = 0; j < s; j += PAGE_SIZE) {
 			vm_page_t m = vm_page_alloc_noobj_domain(domain, 0);
 			if (m == NULL)
 				panic("failed to allocate PV table page");
 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
 		}
 
 		/* Initialize every entry in the range just backed. */
 		for (j = 0; j < s / sizeof(*pvd); j++) {
 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
 			TAILQ_INIT(&pvd->pv_page.pv_list);
 			pvd->pv_page.pv_gen = 0;
 			pvd->pv_page.pat_mode = 0;
 			pvd->pv_invl_gen = 0;
 			pvd++;
 		}
 	}
 	/* The dummy entry gets the same initialization. */
 	pvd = &pv_dummy_large;
 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
 	TAILQ_INIT(&pvd->pv_page.pv_list);
 	pvd->pv_page.pv_gen = 0;
 	pvd->pv_page.pat_mode = 0;
 	pvd->pv_invl_gen = 0;
 }
 #else
 static void
 pmap_init_pv_table(void)
 {
 	vm_size_t s;
 	long i, pv_npg;
 
 	/*
 	 * Initialize the pool of pv list locks.
 	 */
 	for (i = 0; i < NPV_LIST_LOCKS; i++)
 		rw_init(&pv_list_locks[i], "pmap pv list");
 
 	/*
 	 * Calculate the size of the pv head table for superpages.
 	 */
 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
 
 	/*
 	 * Allocate memory for the pv head table for superpages.
 	 */
 	s = (vm_size_t)pv_npg * sizeof(struct md_page);
 	s = round_page(s);
 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 	for (i = 0; i < pv_npg; i++)
 		TAILQ_INIT(&pv_table[i].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 }
 #endif
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  *
  *	In order, this function: blacklists physical page 0 and, on the
  *	affected CPUs, the SKZ63 range; recalculates the EPT 2M-executable
  *	knob; initializes the vm_page entries of the kernel page table
  *	pages; decides on the Erratum 383 workaround; enables superpage
  *	sizes; sets up pv chunk lists and the pv table; makes the direct
  *	map consistent with the pre-init mappings; allocates the qframe
  *	address; and creates the large map arena and its PML4 entries.
  */
 void
 pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
 	int error, i, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
 	vm_page_blacklist_add(0, bootverbose);
 
 	/* Detect bare-metal Skylake Server and Skylake-X. */
 	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
 		/*
 		 * Skylake-X errata SKZ63. Processor May Hang When
 		 * Executing Code In an HLE Transaction Region between
 		 * 40000000H and 403FFFFFH.
 		 *
 		 * Mark the pages in the range as preallocated.  It
 		 * seems to be impossible to distinguish between
 		 * Skylake Server and Skylake X.
 		 */
 		skz63 = 1;
 		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
 		if (skz63 != 0) {
 			if (bootverbose)
 				printf("SKZ63: skipping 4M RAM starting "
 				    "at physical 1G\n");
 			for (i = 0; i < atop(0x400000); i++) {
 				ret = vm_page_blacklist_add(0x40000000 +
 				    ptoa(i), FALSE);
 				if (!ret && bootverbose)
 					printf("page at %#lx already used\n",
 					    0x40000000 + ptoa(i));
 			}
 		}
 	}
 
 	/* IFU */
 	pmap_allow_2m_x_ept_recalculate();
 
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (i = 0; i < nkpt; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
 		    ("pmap_init: page table page is out of range"));
 		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 		mpte->ref_count = 1;
 
 		/*
 		 * Collect the page table pages that were replaced by a 2MB
 		 * page in create_pagetables().  They are zero filled.
 		 */
 		if ((i == 0 ||
 		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, false))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	/* Account for the nkpt page table pages handled above. */
 	vm_wire_add(nkpt);
 
 	/*
 	 * If the kernel is running on a virtual machine, then it must assume
 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 	 * be prepared for the hypervisor changing the vendor and family that
 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
 	 * 10h Erratum 383 is enabled if the processor's feature set does not
 	 * include at least one feature that is only supported by older Intel
 	 * or newer AMD processors.
 	 */
 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 	    AMDID2_FMA4)) == 0)
 		workaround_erratum383 = 1;
 
 	/*
 	 * Are large page mappings enabled?  If so, publish 2M (NBPDR) and,
 	 * when AMDID_PAGE1GB is reported, 1G (NBPDP) as supported sizes.
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 	if (pg_ps_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = NBPDR;
 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
 			    ("pmap_init: can't assign to pagesizes[2]"));
 			pagesizes[2] = NBPDP;
 		}
 	}
 
 	/*
 	 * Initialize pv chunk lists.
 	 */
 	for (i = 0; i < PMAP_MEMDOM; i++) {
 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF);
 		TAILQ_INIT(&pv_chunks[i].pvc_list);
 	}
 	pmap_init_pv_table();
 
 	/*
 	 * Mark the module initialized, then walk the pre-init mappings:
 	 * unused slots (va == 0) are skipped, and the rest are reported
 	 * when booting verbosely.
 	 */
 	pmap_initialized = 1;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == 0)
 			continue;
 		/* Make the direct map consistent */
 		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
 			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
 			    ppim->sz, ppim->mode);
 		}
 		if (!bootverbose)
 			continue;
 		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
 		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
 	}
 
 	/* Reserve one page of kernel address space for qframe. */
 	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 	    (vmem_addr_t *)&qframe);
 	if (error != 0)
 		panic("qframe allocation failed");
 
 	/*
 	 * Size the large map: default to 8 PML4 slots, let the tunable
 	 * override that, and clamp to the slots available (fewer under
 	 * KMSAN).
 	 */
 	lm_ents = 8;
 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
 	if (lm_ents > LMEPML4I - LMSPML4I + 1)
 		lm_ents = LMEPML4I - LMSPML4I + 1;
 #ifdef KMSAN
 	if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
 		printf(
 	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
 		    lm_ents, KMSANORIGPML4I - LMSPML4I);
 		lm_ents = KMSANORIGPML4I - LMSPML4I;
 	}
 #endif
 	if (bootverbose)
 		printf("pmap: large map %u PML4 slots (%lu GB)\n",
 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 	if (lm_ents != 0) {
 		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 		if (large_vmem == NULL) {
 			printf("pmap: cannot create large map\n");
 			lm_ents = 0;
 		}
 		/*
 		 * Install one page table page per large map PML4 slot.
 		 * NOTE(review): the page returned by
 		 * pmap_large_map_getptp_unlocked() is used unchecked --
 		 * confirm it cannot be NULL here.
 		 */
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
 			/* XXXKIB la57 */
 			kernel_pml4[LMSPML4I + i] = X86_PG_V |
 			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
 			    VM_PAGE_TO_PHYS(m);
 		}
 	}
 }
 
 /*
  * vm.pmap.large_map_pml4_entries: read-only view of lm_ents, which is
  * set from the tunable of the same name.
  */
 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0,
     "Maximum number of PML4 entries for use by large map (tunable).  "
     "Each entry corresponds to 512GB of address space.");
 
 /* vm.pmap.pde.*: read-only counters for 2MB page mappings. */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "2MB page mapping counters");
 
 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions);
 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions,
     CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions");
 
 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings);
 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_pde_mappings, "2MB page mappings");
 
 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures);
 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_pde_p_failures, "2MB page promotion failures");
 
 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions);
 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_pde_promotions, "2MB page promotions");
 
 /* vm.pmap.pdpe.*: read-only counters for 1GB page mappings. */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "1GB page mapping counters");
 
 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions);
 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_pdpe_demotions, "1GB page demotions");
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Convert the PAT bit of "entry" between its PTE position
  * (X86_PG_PTE_PAT) and its PDE position (X86_PG_PDE_PAT).
  *
  * For PT_X86 and PT_RVI pmaps, if exactly one of the two bits is set it
  * is moved to the other position; an entry with neither bit set is
  * returned unchanged.  Having both bits set is asserted against.  EPT
  * entries are returned unchanged.  Any other pmap type panics.
  */
 static pt_entry_t
 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 {
 	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		/* Verify that both PAT bits are not set at the same time */
 		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 		    ("Invalid PAT bits in entry %#lx", entry));
 
 		/* Swap the PAT bits if one of them is set */
 		if ((entry & x86_pat_bits) != 0)
 			entry ^= x86_pat_bits;
 		break;
 	case PT_EPT:
 		/*
 		 * Nothing to do - the memory attributes are represented
 		 * the same way for regular pages and superpages.
 		 */
 		break;
 	default:
 		/* Report the function that actually detected the problem. */
 		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
 	}
 
 	return (entry);
 }
 
 /*
  * Report whether "mode" is a usable memory attribute: it must index
  * pat_index[] within bounds and the slot must hold a non-negative value.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 	if (mode < 0 || mode >= PAT_INDEX_SIZE)
 		return (FALSE);
 	return (pat_index[(int)mode] >= 0);
 }
 
 /*
  * Translate a caching mode into the bits that select it in a page table
  * entry of "pmap".  "is_pde" chooses the PDE rather than the PTE
  * position of the PAT bit.  Panics on an invalid mode or an unsupported
  * pmap type.
  */
 int
 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 {
 	int bits, idx;
 
 	if (!pmap_is_valid_memattr(pmap, mode))
 		panic("Unknown caching mode %d\n", mode);
 
 	/* EPT encodes the memory type directly in the entry. */
 	if (pmap->pm_type == PT_EPT)
 		return (EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode));
 	if (pmap->pm_type != PT_X86 && pmap->pm_type != PT_RVI)
 		panic("unsupported pmap type %d", pmap->pm_type);
 
 	/*
 	 * Look up the PAT index for the mode and spread its three bits
 	 * over the PAT, PCD and PWT bits.  The PAT bit sits in a
 	 * different position in PTEs and PDEs.
 	 */
 	idx = pat_index[mode];
 	bits = 0;
 	if ((idx & 0x4) != 0)
 		bits |= is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
 	if ((idx & 0x2) != 0)
 		bits |= PG_NC_PCD;
 	if ((idx & 0x1) != 0)
 		bits |= PG_NC_PWT;
 	return (bits);
 }
 
 /*
  * Return the mask covering every cache-control bit of an entry in
  * "pmap"; "is_pde" selects the PDE rather than the PTE layout.  Panics
  * on an unknown pmap type.
  */
 static int
 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 {
 	if (pmap->pm_type == PT_EPT)
 		return (EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7));
 	if (pmap->pm_type != PT_X86 && pmap->pm_type != PT_RVI)
 		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
 	return (is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE);
 }
 
 /*
  * Recover the PAT index encoded in "pte".  For PT_X86/PT_RVI it is
  * rebuilt from the PAT, PCD and PWT bits ("is_pde" picks the PAT bit
  * position); for PT_EPT it is taken from the memory type field, and an
  * entry with EPT_PG_IGNORE_PAT set panics.  Other pmap types yield 0.
  * Indices 4 and 7 are folded onto 0 and 3.
  */
 static int
 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
 {
 	int idx;
 
 	idx = 0;
 	if (pmap->pm_type == PT_X86 || pmap->pm_type == PT_RVI) {
 		/* The PAT bit is different for PTE's and PDE's. */
 		if ((pte & (is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT)) != 0)
 			idx |= 0x4;
 		if ((pte & PG_NC_PCD) != 0)
 			idx |= 0x2;
 		if ((pte & PG_NC_PWT) != 0)
 			idx |= 0x1;
 	} else if (pmap->pm_type == PT_EPT) {
 		if ((pte & EPT_PG_IGNORE_PAT) != 0)
 			panic("EPT PTE %#lx has no PAT memory type", pte);
 		idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
 	}
 
 	/* See pmap_init_pat(). */
 	switch (idx) {
 	case 4:
 		idx = 0;
 		break;
 	case 7:
 		idx = 3;
 		break;
 	default:
 		break;
 	}
 
 	return (idx);
 }
 
 /*
  * Superpages are usable in "pmap" only when they are enabled globally
  * (pg_ps_enabled) and the pmap carries PMAP_PDE_SUPERPAGE.
  */
 bool
 pmap_ps_enabled(pmap_t pmap)
 {
 	if (!pg_ps_enabled)
 		return (false);
 	return ((pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 }
 
 /*
  * Store "newpde" into "*pde".  For PT_RVI and PT_EPT pmaps pm_eptgen is
  * incremented first; PT_X86 needs nothing extra; any other type panics.
  */
 static void
 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 {
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		/*
 		 * XXX
 		 * Bumping the generation number here is a little bogus:
 		 * it is meant to be advanced when a region of the address
 		 * space is invalidated in the page tables.
 		 *
 		 * Here the old PDE is still valid, yet any mapping that
 		 * uses it must be invalidated in the TLB.
 		 *
 		 * This works as expected because we rendezvous "all" host
 		 * cpus and force any vcpu context to exit as a
 		 * side-effect.
 		 */
 		atomic_add_long(&pmap->pm_eptgen, 1);
 	} else if (pmap->pm_type != PT_X86) {
 		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
 	}
 	pde_store(pde, newpde);
 }
 
 /*
  * Flush the calling processor's TLB after the page size of the mapping
  * at "va" has been changed to what "newpde" describes.  No other
  * processor is touched, and guest pmaps are ignored.
  *
  * The calling thread must be pinned to a processor.
  */
 static void
 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 {
 	pt_entry_t global_bit;
 
 	if (pmap_type_guest(pmap))
 		return;
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
 
 	global_bit = pmap_global_bit(pmap);
 
 	if ((newpde & PG_PS) == 0) {
 		/* Demotion: flush a specific 2MB page mapping. */
 		invlpg(va);
 		return;
 	}
 
 	/*
 	 * Promotion: there are too many 4KB page mappings to flush
 	 * individually, so flush them all.  Global (PG_G) mappings are
 	 * included only when the new PDE is itself global.
 	 */
 	if ((newpde & global_bit) != 0)
 		invltlb_glob();
 	else
 		invltlb();
 }
 
 /*
  * The amd64 pmap uses different approaches to TLB invalidation
  * depending on the kernel configuration, available hardware features,
  * and known hardware errata.  The kernel configuration option that
  * has the greatest operational impact on TLB invalidation is PTI,
  * which is enabled automatically on affected Intel CPUs.  The most
  * impactful hardware features are first PCID, and then INVPCID
  * instruction presence.  PCID usage is quite different for PTI
  * vs. non-PTI.
  *
  * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate
  *   the Meltdown bug in some Intel CPUs.  Under PTI, each user address
  *   space is served by two page tables, user and kernel.  The user
  *   page table only maps user space and a kernel trampoline.  The
  *   kernel trampoline includes the entirety of the kernel text but
  *   only the kernel data that is needed to switch from user to kernel
  *   mode.  The kernel page table maps the user and kernel address
  *   spaces in their entirety.  It is identical to the per-process
  *   page table used in non-PTI mode.
  *
  *   User page tables are only used when the CPU is in user mode.
  *   Consequently, some TLB invalidations can be postponed until the
  *   switch from kernel to user mode.  In contrast, the user
  *   space part of the kernel page table is used for copyout(9), so
  *   TLB invalidations on this page table cannot be similarly postponed.
  *
  *   The existence of a user mode page table for the given pmap is
  *   indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in
  *   which case pm_ucr3 contains the %cr3 register value for the user
  *   mode page table's root.
  *
  * * The pm_active bitmask indicates which CPUs currently have the
  *   pmap active.  A CPU's bit is set on context switch to the pmap, and
  *   cleared on switching off this CPU.  For the kernel page table,
  *   the pm_active field is immutable and contains all CPUs.  The
  *   kernel page table is always logically active on every processor,
  *   but not necessarily in use by the hardware, e.g., in PTI mode.
  *
  *   When requesting invalidation of virtual addresses with
  *   pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to
  *   all CPUs recorded as active in pm_active.  Updates to and reads
  *   from pm_active are not synchronized, and so they may race with
  *   each other.  Shootdown handlers are prepared to handle the race.
  *
  * * PCID is an optional feature of the long mode x86 MMU where TLB
  *   entries are tagged with the 'Process ID' of the address space
  *   they belong to.  This feature provides a limited namespace for
  *   process identifiers, 12 bits, supporting 4095 simultaneous IDs
  *   total.
  *
  *   Allocation of a PCID to a pmap is done by an algorithm described
  *   in section 15.12, "Other TLB Consistency Algorithms", of
  *   Vahalia's book "Unix Internals".  A PCID cannot be allocated for
  *   the whole lifetime of a pmap in pmap_pinit() due to the limited
  *   namespace.  Instead, a per-CPU, per-pmap PCID is assigned when
  *   the CPU is about to start caching TLB entries from a pmap,
  *   i.e., on the context switch that activates the pmap on the CPU.
  *
  *   The PCID allocator maintains a per-CPU, per-pmap generation
  *   count, pm_gen, which is incremented each time a new PCID is
  *   allocated.  On TLB invalidation, the generation counters for the
  *   pmap are zeroed, which signals the context switch code that the
  *   previously allocated PCID is no longer valid.  Effectively,
  *   zeroing any of these counters triggers a TLB shootdown for the
  *   given CPU/address space, due to the allocation of a new PCID.
  *
  *   Zeroing can be performed remotely.  Consequently, if a pmap is
  *   inactive on a CPU, then a TLB shootdown for that pmap and CPU can
  *   be initiated by an ordinary memory access to reset the target
  *   CPU's generation count within the pmap.  The CPU initiating the
  *   TLB shootdown does not need to send an IPI to the target CPU.
  *
  * * PTI + PCID.  The available PCIDs are divided into two sets: PCIDs
  *   for complete (kernel) page tables, and PCIDs for user mode page
  *   tables.  A user PCID value is obtained from the kernel PCID value
  *   by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT).
  *
  *   User space page tables are activated on return to user mode, by
  *   loading pm_ucr3 into %cr3.  If the PCPU(ucr3_load_mask) requests
  *   clearing bit 63 of the loaded ucr3, this effectively causes
  *   complete invalidation of the user mode TLB entries for the
  *   current pmap.  In which case, local invalidations of individual
  *   pages in the user page table are skipped.
  *
  * * Local invalidation, all modes.  If the requested invalidation is
  *   for a specific address or the total invalidation of a currently
  *   active pmap, then the TLB is flushed using INVLPG for a kernel
  *   page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a
  *   user space page table(s).
  *
  *   If the INVPCID instruction is available, it is used to flush entries
  *   from the kernel page table.
  *
  * * mode: PTI disabled, PCID present.  The kernel reserves PCID 0 for its
  *   address space, all other 4095 PCIDs are used for user mode spaces
  *   as described above.  A context switch allocates a new PCID if
  *   the recorded PCID is zero or the recorded generation does not match
  *   the CPU's generation, effectively flushing the TLB for this address space.
  *   Total remote invalidation is performed by zeroing pm_gen for all CPUs.
  *	local user page: INVLPG
  *	local kernel page: INVLPG
  *	local user total: INVPCID(CTX)
  *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
  *	remote user page, inactive pmap: zero pm_gen
  *	remote user page, active pmap: zero pm_gen + IPI:INVLPG
  *	(Both actions are required to handle the aforementioned pm_active races.)
  *	remote kernel page: IPI:INVLPG
  *	remote user total, inactive pmap: zero pm_gen
  *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or
  *          reload %cr3)
  *	(See note above about pm_active races.)
  *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
  *
  * PTI enabled, PCID present.
  *	local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3)
  *          for upt
  *	local kernel page: INVLPG
  *	local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE
  *          on loading UCR3 into %cr3 for upt
  *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
  *	remote user page, inactive pmap: zero pm_gen
  *	remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt,
  *          INVPCID(ADDR) for upt)
  *	remote kernel page: IPI:INVLPG
  *	remote user total, inactive pmap: zero pm_gen
  *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt,
  *          clear PCID_SAVE on loading UCR3 into %cr3 for upt)
  *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
  *
  *  No PCID.
  *	local user page: INVLPG
  *	local kernel page: INVLPG
  *	local user total: reload %cr3
  *	local kernel total: invltlb_glob()
  *	remote user page, inactive pmap: -
  *	remote user page, active pmap: IPI:INVLPG
  *	remote kernel page: IPI:INVLPG
  *	remote user total, inactive pmap: -
  *	remote user total, active pmap: IPI:(reload %cr3)
  *	remote kernel total: IPI:invltlb_glob()
  *  Since on return to user mode, the reload of %cr3 with ucr3 causes
  *  TLB invalidation, no specific action is required for user page table.
  *
  * EPT.  EPT pmaps do not map KVA, all mappings are userspace.
  * XXX TODO
  */
 
 #ifdef SMP
 /*
  * Interrupt the cpus that are executing in the guest context.
  * This will force the vcpu to exit and the cached EPT mappings
  * will be invalidated by the host before the next vmresume.
  *
  * Sequence: bump pm_eptgen, advance the pmap's SMR sequence, send the
  * pmap's nested IPI to every CPU in pm_active, and finally wait for the
  * SMR goal.  The thread is pinned from the pm_active assertion through
  * the IPI; the wait happens after unpinning.
  */
 static __inline void
 pmap_invalidate_ept(pmap_t pmap)
 {
 	smr_seq_t goal;
 	int ipinum;
 
 	sched_pin();
 	/* The calling CPU must not itself be recorded in pm_active. */
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("pmap_invalidate_ept: absurd pm_active"));
 
 	/*
 	 * The TLB mappings associated with a vcpu context are not
 	 * flushed each time a different vcpu is chosen to execute.
 	 *
 	 * This is in contrast with a process's vtop mappings that
 	 * are flushed from the TLB on each context switch.
 	 *
 	 * Therefore we need to do more than just a TLB shootdown on
 	 * the active cpus in 'pmap->pm_active'. To do this we keep
 	 * track of the number of invalidations performed on this pmap.
 	 *
 	 * Each vcpu keeps a cache of this counter and compares it
 	 * just before a vmresume. If the counter is out-of-date an
 	 * invept will be done to flush stale mappings from the TLB.
 	 *
 	 * To ensure that all vCPU threads have observed the new counter
 	 * value before returning, we use SMR.  Ordering is important here:
 	 * the VMM enters an SMR read section before loading the counter
 	 * and after updating the pm_active bit set.  Thus, pm_active is
 	 * a superset of active readers, and any reader that has observed
 	 * the goal has observed the new counter value.
 	 */
 	atomic_add_long(&pmap->pm_eptgen, 1);
 
 	goal = smr_advance(pmap->pm_eptsmr);
 
 	/*
 	 * Force the vcpu to exit and trap back into the hypervisor.
 	 * The IPI number is kept in the low bits of pm_flags.
 	 */
 	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
 	ipi_selected(pmap->pm_active, ipinum);
 	sched_unpin();
 
 	/*
 	 * Ensure that all active vCPUs will observe the new generation counter
 	 * value before executing any more guest instructions.
 	 */
 	smr_wait(pmap->pm_eptsmr, goal);
 }
 
 /*
  * Select the CPUs that must take part in an invalidation for "pmap":
  * every CPU for the kernel pmap, otherwise the pmap's pm_active set.
  */
 static cpuset_t
 pmap_invalidate_cpu_mask(pmap_t pmap)
 {
 	if (pmap == kernel_pmap)
 		return (all_cpus);
 	return (pmap->pm_active);
 }
 
 /*
  * PCID flavour of the pre-IPI step of an invalidation.
  *
  * Pins the thread (the matching unpin is not in this function), then
  * zeroes pm_gen in the pmap's per-CPU PCID slots.  The current CPU's
  * slot is left alone only when "pmap" is the pmap currently active on
  * it.  A sequentially consistent fence follows the stores.
  */
 static inline void
 pmap_invalidate_preipi_pcid(pmap_t pmap)
 {
 	u_int cpuid, i;
 
 	sched_pin();
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap != PCPU_GET(curpmap))
 		cpuid = 0xffffffff;	/* An impossible value */
 
 	/* With the impossible value, no CPU is exempt from the reset. */
 	CPU_FOREACH(i) {
 		if (cpuid != i)
 			pmap->pm_pcids[i].pm_gen = 0;
 	}
 
 	/*
 	 * The fence is between stores to pm_gen and the read of the
 	 * pm_active mask.  We need to ensure that it is impossible
 	 * for us to miss the bit update in pm_active and
 	 * simultaneously observe a non-zero pm_gen in
 	 * pmap_activate_sw(), otherwise TLB update is missed.
 	 * Without the fence, IA32 allows such an outcome.  Note that
 	 * pm_active is updated by a locked operation, which provides
 	 * the reciprocal fence.
 	 */
 	atomic_thread_fence_seq_cst();
 }
 
 /*
  * Non-PCID flavour of the pre-IPI step: only pins the thread.  The
  * matching unpin is not in this function.
  */
 static void
 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
 {
 	sched_pin();
 }
 
 /*
  * Resolver for pmap_invalidate_preipi(): pick the PCID or the non-PCID
  * implementation according to pmap_pcid_enabled.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t))
 {
 	if (pmap_pcid_enabled)
 		return (pmap_invalidate_preipi_pcid);
 	return (pmap_invalidate_preipi_nopcid);
 }
 
 /*
  * Invalidate "va" in the user-mode page table of "pmap" on the current
  * CPU when PCID is in use.  "invpcid_works1" selects INVPCID over the
  * pmap_pti_pcid_invlpg() helper.
  *
  * Must be called in a critical section.
  */
 static inline void
 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 	u_int cpuid;
 
 	/*
 	 * Because pm_pcid is recalculated on a context switch, we
 	 * must ensure there is no preemption, not just pinning.
 	 * Otherwise, we might use a stale value below.
 	 */
 	CRITICAL_ASSERT(curthread);
 
 	/*
 	 * No need to do anything with user page tables invalidation
 	 * if there is no user page table, or invalidation is deferred
 	 * until the return to userspace.  ucr3_load_mask is stable
 	 * because we have preemption disabled.
 	 */
 	if (pmap->pm_ucr3 == PMAP_NO_CR3 ||
 	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	cpuid = PCPU_GET(cpuid);
 
 	pcid = pmap->pm_pcids[cpuid].pm_pcid;
 	if (invpcid_works1) {
 		/* Target the user PCID: the pmap's PCID with the user bit. */
 		d.pcid = pcid | PMAP_PCID_USER_PT;
 		d.pad = 0;
 		d.addr = va;
 		invpcid(&d, INVPCID_ADDR);
 	} else {
 		/* Build both %cr3 values and let the helper do the flush. */
 		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 	}
 }
 
 /* PCID page callback for CPUs where INVPCID is used. */
 static void
 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
 {
 	pmap_invalidate_page_pcid_cb(pmap, va, true);
 }
 
 /* PCID page callback for CPUs where INVPCID is not used. */
 static void
 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
 {
 	pmap_invalidate_page_pcid_cb(pmap, va, false);
 }
 
 /* Page callback used without PCID: intentionally does nothing. */
 static void
 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
 {
 }
 
 /*
  * Resolver for pmap_invalidate_page_cb(): without PCID the no-op
  * callback is used; with PCID the choice depends on invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t))
 {
 	if (!pmap_pcid_enabled)
 		return (pmap_invalidate_page_nopcid_cb);
 	if (invpcid_works)
 		return (pmap_invalidate_page_pcid_invpcid_cb);
 	return (pmap_invalidate_page_pcid_noinvpcid_cb);
 }
 
 /*
  * Callback handed to smp_masked_invlpg(): flush "va" from the local TLB
  * when "pmap" is the kernel pmap or the pmap active on this CPU.  For a
  * non-kernel pmap the PCID-specific callback runs as well.
  */
 static void
 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
 	/* Neither the kernel pmap nor the current one: nothing to do. */
 	if (pmap != kernel_pmap && pmap != PCPU_GET(curpmap))
 		return;
 
 	invlpg(va);
 	if (pmap != kernel_pmap)
 		pmap_invalidate_page_cb(pmap, va);
 }
 
 /*
  * Invalidate the TLB entry for "va" in "pmap" (SMP version).
  *
  * Guest pmaps are passed to pmap_invalidate_ept() instead.  For PT_X86
  * pmaps the pre-IPI hook is run and smp_masked_invlpg() is called on the
  * CPU set returned by pmap_invalidate_cpu_mask(), with
  * pmap_invalidate_page_curcpu_cb() as the callback.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 
 	/* The pre-IPI hook calls sched_pin(); no unpin is visible here. */
 	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
 	    pmap_invalidate_page_curcpu_cb);
 }
 
 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 /*
  * With PCID enabled, invalidate the user page table's TLB entries for
  * [sva, eva) on the current CPU.  "invpcid_works1" is a constant passed by
  * the thin wrappers and selects between INVPCID and
  * pmap_pti_pcid_invlrng().
  */
 static void
 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 	CRITICAL_ASSERT(curthread);
 
 	/* Only the pmap active on this CPU is handled here. */
 	if (pmap != PCPU_GET(curpmap))
 		return;
 	/* No user page table: nothing to invalidate. */
 	if (pmap->pm_ucr3 == PMAP_NO_CR3)
 		return;
 	/* Skip when ucr3_load_mask is not PMAP_UCR3_NOMASK. */
 	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
 		return;
 
 	pcid = pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	if (!invpcid_works1) {
 		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 		return;
 	}
 
 	d.pcid = pcid | PMAP_PCID_USER_PT;
 	d.pad = 0;
 	d.addr = sva;
 	while (d.addr < eva) {
 		invpcid(&d, INVPCID_ADDR);
 		d.addr += PAGE_SIZE;
 	}
 }
 
 /* Variant of pmap_invalidate_range_pcid_cb() that uses INVPCID. */
 static void
 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
 	pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
 }
 
 /* Variant of pmap_invalidate_range_pcid_cb() that does not use INVPCID. */
 static void
 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
 	pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
 }
 
 /*
  * Selected when PCID is disabled: no additional per-range work is needed,
  * so the body is intentionally empty.
  */
 static void
 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
     vm_offset_t eva __unused)
 {
 }
 
 /*
  * Resolver for pmap_invalidate_range_cb(): a no-op without PCID, otherwise
  * the INVPCID or non-INVPCID flavour depending on invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
     vm_offset_t))
 {
 	if (!pmap_pcid_enabled)
 		return (pmap_invalidate_range_nopcid_cb);
 	if (invpcid_works)
 		return (pmap_invalidate_range_pcid_invpcid_cb);
 	return (pmap_invalidate_range_pcid_noinvpcid_cb);
 }
 
 /*
  * Callback handed to smp_masked_invlpg_range(): flush every page of
  * [sva, eva) from the local TLB when "pmap" is the kernel pmap or the pmap
  * active on this CPU.  For a non-kernel pmap the PCID-specific callback
  * runs as well.
  */
 static void
 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va;
 
 	/* Neither the kernel pmap nor the current one: nothing to do. */
 	if (pmap != kernel_pmap && pmap != PCPU_GET(curpmap))
 		return;
 
 	for (va = sva; va < eva; va += PAGE_SIZE)
 		invlpg(va);
 	if (pmap != kernel_pmap)
 		pmap_invalidate_range_cb(pmap, sva, eva);
 }
 
 /*
  * Invalidate the TLB entries for [sva, eva) in "pmap" (SMP version).
  *
  * Ranges of at least PMAP_INVLPG_THRESHOLD bytes are converted into a full
  * invalidation via pmap_invalidate_all().  Guest pmaps are passed to
  * pmap_invalidate_ept().  Otherwise the pre-IPI hook is run and
  * smp_masked_invlpg_range() is called on the CPU set returned by
  * pmap_invalidate_cpu_mask().
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	/* Checked first, so it applies to guest pmaps as well. */
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all(pmap);
 		return;
 	}
 
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 
 	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
 	    pmap_invalidate_range_curcpu_cb);
 }
 
 /*
  * With PCID enabled, flush all TLB entries belonging to "pmap" on the
  * current CPU.  "invpcid_works1" is a constant passed by the thin wrappers
  * and selects between INVPCID and the CR3/global-flush fallbacks.  Pmaps
  * that are neither the kernel pmap nor the current pmap are ignored.
  */
 static inline void
 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3;
 	uint32_t pcid;
 	u_int cpuid;
 
 	if (pmap == kernel_pmap) {
 		/* Kernel pmap: INVPCID_CTXGLOB or invltlb_glob(). */
 		if (invpcid_works1) {
 			bzero(&d, sizeof(d));
 			invpcid(&d, INVPCID_CTXGLOB);
 		} else {
 			invltlb_glob();
 		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		CRITICAL_ASSERT(curthread);
 		cpuid = PCPU_GET(cpuid);
 
 		pcid = pmap->pm_pcids[cpuid].pm_pcid;
 		if (invpcid_works1) {
 			d.pcid = pcid;
 			d.pad = 0;
 			d.addr = 0;
 			invpcid(&d, INVPCID_CTX);
 		} else {
 			/* CR3 is loaded without CR3_PCID_SAVE set. */
 			kcr3 = pmap->pm_cr3 | pcid;
 			load_cr3(kcr3);
 		}
 		/*
 		 * NOTE(review): presumably this defers the user page table
 		 * flush to the next return to user mode by masking
 		 * CR3_PCID_SAVE out of the ucr3 load -- confirm against
 		 * the consumer of ucr3_load_mask.
 		 */
 		if (pmap->pm_ucr3 != PMAP_NO_CR3)
 			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 	}
 }
 
 /* Variant of pmap_invalidate_all_pcid_cb() that uses INVPCID. */
 static void
 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
 {
 	pmap_invalidate_all_pcid_cb(pmap, true);
 }
 
 /* Variant of pmap_invalidate_all_pcid_cb() that does not use INVPCID. */
 static void
 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
 {
 	pmap_invalidate_all_pcid_cb(pmap, false);
 }
 
 /*
  * Full-flush callback used when PCID is disabled: invltlb_glob() for the
  * kernel pmap, invltlb() for the pmap active on this CPU, and nothing for
  * any other pmap.
  */
 static void
 pmap_invalidate_all_nopcid_cb(pmap_t pmap)
 {
 	if (pmap == kernel_pmap) {
 		invltlb_glob();
 		return;
 	}
 	if (pmap == PCPU_GET(curpmap))
 		invltlb();
 }
 
 /*
  * Resolver for pmap_invalidate_all_cb(): the non-PCID flush without PCID,
  * otherwise the INVPCID or non-INVPCID flavour depending on invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t))
 {
 	if (!pmap_pcid_enabled)
 		return (pmap_invalidate_all_nopcid_cb);
 	if (invpcid_works)
 		return (pmap_invalidate_all_pcid_invpcid_cb);
 	return (pmap_invalidate_all_pcid_noinvpcid_cb);
 }
 
 /*
  * Callback handed to smp_masked_invltlb(); the address arguments are
  * unused and the work is delegated to pmap_invalidate_all_cb().
  */
 static void
 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
     vm_offset_t addr2 __unused)
 {
 	pmap_invalidate_all_cb(pmap);
 }
 
 /*
  * Invalidate all TLB entries for "pmap" (SMP version).
  *
  * Guest pmaps are passed to pmap_invalidate_ept().  For PT_X86 pmaps the
  * pre-IPI hook is run and smp_masked_invltlb() is called on the CPU set
  * returned by pmap_invalidate_cpu_mask().
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 
 	pmap_invalidate_preipi(pmap);
 	smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
 	    pmap_invalidate_all_curcpu_cb);
 }
 
 /*
  * Callback handed to smp_cache_flush(): executes wbinvd(); all arguments
  * are unused.
  */
 static void
 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
     vm_offset_t addr2 __unused)
 {
 	wbinvd();
 }
 
 /*
  * Flush the CPU caches (SMP version): pins the current thread and calls
  * smp_cache_flush() with a callback that executes wbinvd().
  */
 void
 pmap_invalidate_cache(void)
 {
 	/* NOTE(review): no sched_unpin() is visible here -- confirm. */
 	sched_pin();
 	smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
 }
 
 /*
  * Argument block shared by pmap_update_pde_action() and
  * pmap_update_pde_teardown() while pmap_update_pde() runs them through
  * smp_rendezvous_cpus().
  */
 struct pde_action {
 	cpuset_t invalidate;	/* processors that invalidate their TLB */
 	pmap_t pmap;		/* pmap that owns the PDE */
 	vm_offset_t va;		/* virtual address mapped by the PDE */
 	pd_entry_t *pde;	/* PDE to be updated */
 	pd_entry_t newpde;	/* new value to store into *pde */
 	u_int store;		/* processor that updates the PDE */
 };
 
 /*
  * Rendezvous action: only the CPU recorded in act->store writes the new
  * PDE value; every other CPU returns immediately.
  */
 static void
 pmap_update_pde_action(void *arg)
 {
 	struct pde_action *action;
 
 	action = arg;
 	if (action->store != PCPU_GET(cpuid))
 		return;
 	pmap_update_pde_store(action->pmap, action->pde, action->newpde);
 }
 
 /*
  * Rendezvous teardown: every CPU present in act->invalidate invalidates
  * its TLB for the updated PDE.
  */
 static void
 pmap_update_pde_teardown(void *arg)
 {
 	struct pde_action *action;
 
 	action = arg;
 	if (!CPU_ISSET(PCPU_GET(cpuid), &action->invalidate))
 		return;
 	pmap_update_pde_invalidate(action->pmap, action->va, action->newpde);
 }
 
 /*
  * Change the page size used for "va" so that the TLB can never hold two
  * entries mapping the same virtual address with different page sizes.
  * This is the recommended workaround for Erratum 383 on AMD Family 10h
  * processors, where such a TLB state is improperly diagnosed as a hardware
  * error and raises a machine check exception.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 	struct pde_action act;
 	cpuset_t active, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 
 	/* Every CPU except the one we are running on. */
 	other_cpus = all_cpus;
 	CPU_CLR(cpuid, &other_cpus);
 
 	/* CPUs that take part in the invalidation. */
 	if (pmap != kernel_pmap && !pmap_type_guest(pmap))
 		active = pmap->pm_active;
 	else
 		active = all_cpus;
 
 	if (!CPU_OVERLAP(&active, &other_cpus)) {
 		/* No other CPU is involved: update and invalidate locally. */
 		pmap_update_pde_store(pmap, pde, newpde);
 		if (CPU_ISSET(cpuid, &active))
 			pmap_update_pde_invalidate(pmap, va, newpde);
 	} else {
 		act.pmap = pmap;
 		act.va = va;
 		act.pde = pde;
 		act.newpde = newpde;
 		act.store = cpuid;
 		/* Snapshot taken before this CPU is added to "active". */
 		act.invalidate = active;
 		CPU_SET(cpuid, &active);
 		smp_rendezvous_cpus(active,
 		    smp_no_rendezvous_barrier, pmap_update_pde_action,
 		    pmap_update_pde_teardown, &act);
 	}
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, invalidation functions.
  */
 /*
  * Invalidate the TLB entry for "va" in "pmap" (uniprocessor version).
  *
  * PT_RVI and PT_EPT pmaps only have their pm_eptgen generation bumped.
  * For the kernel pmap or the current pmap the entry is flushed with
  * invlpg(); when the current pmap also has a user page table and PCID is
  * enabled, the user PCID's entry is flushed as well.  For any other pmap
  * with PCID enabled, pm_gen is reset to zero.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	/* The message names this function so a panic points at the right place. */
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
 
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
 		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 		    pmap->pm_ucr3 != PMAP_NO_CR3) {
 			/* Read and use pm_pcid inside a critical section. */
 			critical_enter();
 			pcid = pmap->pm_pcids[0].pm_pcid;
 			if (invpcid_works) {
 				d.pcid = pcid | PMAP_PCID_USER_PT;
 				d.pad = 0;
 				d.addr = va;
 				invpcid(&d, INVPCID_ADDR);
 			} else {
 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 				ucr3 = pmap->pm_ucr3 | pcid |
 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 			}
 			critical_exit();
 		}
 	} else if (pmap_pcid_enabled)
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 
 /*
  * Invalidate the TLB entries for [sva, eva) in "pmap" (uniprocessor
  * version).
  *
  * PT_RVI and PT_EPT pmaps only have their pm_eptgen generation bumped.
  * For the kernel pmap or the current pmap each page is flushed with
  * invlpg(); when the current pmap also has a user page table and PCID is
  * enabled, the user PCID's entries are flushed as well.  For any other
  * pmap with PCID enabled, pm_gen is reset to zero.
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct invpcid_descr d;
 	vm_offset_t addr;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 
 	/* Inactive pmap: only reset its PCID generation, if PCID is used. */
 	if (pmap != kernel_pmap && pmap != PCPU_GET(curpmap)) {
 		if (pmap_pcid_enabled)
 			pmap->pm_pcids[0].pm_gen = 0;
 		return;
 	}
 
 	for (addr = sva; addr < eva; addr += PAGE_SIZE)
 		invlpg(addr);
 
 	/* The user page table needs no extra flush in these cases. */
 	if (pmap != PCPU_GET(curpmap) || !pmap_pcid_enabled ||
 	    pmap->pm_ucr3 == PMAP_NO_CR3)
 		return;
 
 	critical_enter();
 	pcid = pmap->pm_pcids[0].pm_pcid;
 	if (invpcid_works) {
 		d.pcid = pcid | PMAP_PCID_USER_PT;
 		d.pad = 0;
 		for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
 			invpcid(&d, INVPCID_ADDR);
 	} else {
 		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT |
 		    CR3_PCID_SAVE;
 		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 	}
 	critical_exit();
 }
 
 /*
  * Invalidate all TLB entries for "pmap" (uniprocessor version).
  *
  * PT_RVI and PT_EPT pmaps only have their pm_eptgen generation bumped.
  * The kernel pmap gets a global flush.  The current pmap gets a flush of
  * its PCID context(s) when PCID is enabled, or a plain invltlb()
  * otherwise.  For any other pmap with PCID enabled, pm_gen is reset to
  * zero.
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
 
 	if (pmap == kernel_pmap) {
 		if (pmap_pcid_enabled && invpcid_works) {
 			bzero(&d, sizeof(d));
 			invpcid(&d, INVPCID_CTXGLOB);
 		} else {
 			invltlb_glob();
 		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		if (pmap_pcid_enabled) {
 			critical_enter();
 			if (invpcid_works) {
 				/* Flush the kernel-mode PCID context ... */
 				d.pcid = pmap->pm_pcids[0].pm_pcid;
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
 				/* ... and the user one, if it exists. */
 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 					d.pcid |= PMAP_PCID_USER_PT;
 					invpcid(&d, INVPCID_CTX);
 				}
 			} else {
 				/* CR3 values built without CR3_PCID_SAVE. */
 				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 					ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
 					    0].pm_pcid | PMAP_PCID_USER_PT;
 					pmap_pti_pcid_invalidate(ucr3, kcr3);
 				} else
 					load_cr3(kcr3);
 			}
 			critical_exit();
 		} else {
 			invltlb();
 		}
 	} else if (pmap_pcid_enabled) {
 		pmap->pm_pcids[0].pm_gen = 0;
 	}
 }
 
 /* Flush the CPU caches (uniprocessor version): executes wbinvd(). */
 PMAP_INLINE void
 pmap_invalidate_cache(void)
 {
 
 	wbinvd();
 }
 
 /*
  * Uniprocessor version of pmap_update_pde(): store the new PDE and then
  * either invalidate the local TLB (kernel or current pmap) or reset the
  * pmap's pm_gen to zero.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 
 	pmap_update_pde_store(pmap, pde, newpde);
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
 		pmap_update_pde_invalidate(pmap, va, newpde);
 	else
 		/* Unlike the invalidate routines, not gated on pmap_pcid_enabled. */
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 #endif /* !SMP */
 
 /*
  * Invalidate the TLB state associated with the 2MB page mapping described
  * by "pde" at "va".
  */
 static void
 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 {
 
 	/*
 	 * With PG_PROMOTED clear, the TLB holds no 4KB page mappings for
 	 * the address range [va, va + NBPDR), so a single INVLPG is enough
 	 * to remove the 2MB page mapping from the TLB.
 	 */
 	if ((pde & PG_PROMOTED) == 0) {
 		pmap_invalidate_page(pmap, va);
 		return;
 	}
 
 	/*
 	 * With PG_PROMOTED set, the 2MB page mapping was created by a
 	 * promotion that did not invalidate the 512 4KB page mappings that
 	 * might exist in the TLB.  The TLB may therefore hold both 4KB and
 	 * 2MB page mappings for [va, va + NBPDR), and the entire range
 	 * must be invalidated.
 	 */
 	pmap_invalidate_range(pmap, va, va + NBPDR - 1);
 }
 
 /*
  * Resolver for pmap_invalidate_cache_range().  In order of preference:
  * the alignment-check-only variant when the CPU reports CPUID_SS, the
  * CLFLUSH-based variant when it reports CPUID_CLFSH, and a full cache
  * invalidation otherwise.
  */
 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
     (vm_offset_t sva, vm_offset_t eva))
 {
 
 	if ((cpu_feature & CPUID_SS) != 0)
 		return (pmap_invalidate_cache_range_selfsnoop);
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		return (pmap_force_invalidate_cache_range);
 	return (pmap_invalidate_cache_range_all);
 }
 
 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
 
 /*
  * Assert (under KASSERT) that both ends of the range are page-aligned.
  */
 static void
 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
 {
 
 	KASSERT((sva & PAGE_MASK) == 0,
 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
 	KASSERT((eva & PAGE_MASK) == 0,
 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
 }
 
 /*
  * Variant selected when the CPU reports CPUID_SS (self-snoop): no flush
  * is performed, only the alignment of the arguments is checked.
  */
 static void
 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 }
 
 /*
  * Flush the cache lines covering [sva, eva) with CLFLUSHOPT when the CPU
  * supports it and with CLFLUSH otherwise.  The start address is rounded
  * down to a cache line boundary first.
  */
 void
 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t line;
 
 	line = sva & ~(vm_offset_t)(cpu_clflush_line_size - 1);
 
 	/*
 	 * XXX: Some CPUs fault, hang, or trash the local APIC
 	 * registers if we use CLFLUSH on the local APIC range.  The
 	 * local APIC is always uncached, so we don't need to flush
 	 * for that range anyway.
 	 */
 	if (pmap_kextract(line) == lapic_paddr)
 		return;
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) == 0) {
 		/*
 		 * Writes are ordered by CLFLUSH on Intel CPUs, so the
 		 * fences are only needed on other vendors' CPUs.
 		 */
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		while (line < eva) {
 			clflush(line);
 			line += cpu_clflush_line_size;
 		}
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		return;
 	}
 
 	/*
 	 * Do per-cache line flush.  Use a locked instruction to ensure
 	 * that previous stores are included in the write-back.  The
 	 * processor propagates flush to other processors in the cache
 	 * coherence domain.
 	 */
 	atomic_thread_fence_seq_cst();
 	while (line < eva) {
 		clflushopt(line);
 		line += cpu_clflush_line_size;
 	}
 	atomic_thread_fence_seq_cst();
 }
 
 /*
  * Fallback variant: after checking the alignment of the arguments,
  * invalidate the caches as a whole via pmap_invalidate_cache().
  */
 static void
 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 	pmap_invalidate_cache();
 }
 
 /*
  * Remove the specified set of pages from the data and instruction caches.
  *
  * Unlike pmap_invalidate_cache_range(), this function does not rely on the
  * CPU's self-snoop feature, because it is intended for use when moving
  * pages into a different cache domain.
  */
 void
 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 {
 	vm_offset_t line, end;
 	int idx;
 	bool useclflushopt;
 
 	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 
 	/*
 	 * Fall back to a full cache invalidation when the request is large
 	 * or when no per-line flush instruction is available.
 	 */
 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) {
 		pmap_invalidate_cache();
 		return;
 	}
 
 	if (useclflushopt)
 		atomic_thread_fence_seq_cst();
 	else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		mfence();
 
 	/* Flush every page through its direct map address. */
 	for (idx = 0; idx < count; idx++) {
 		line = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[idx]));
 		end = line + PAGE_SIZE;
 		while (line < end) {
 			if (useclflushopt)
 				clflushopt(line);
 			else
 				clflush(line);
 			line += cpu_clflush_line_size;
 		}
 	}
 
 	if (useclflushopt)
 		atomic_thread_fence_seq_cst();
 	else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		mfence();
 }
 
 /*
  * Flush the cache lines covering the page-aligned range [sva, eva).  Uses
  * CLWB when the CPU supports it and otherwise defers to
  * pmap_force_invalidate_cache_range().
  */
 void
 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
 		pmap_force_invalidate_cache_range(sva, eva);
 		return;
 	}
 
 	/* See comment in pmap_force_invalidate_cache_range(). */
 	if (pmap_kextract(sva) == lapic_paddr)
 		return;
 
 	/* Fences bracket the per-line CLWB loop. */
 	atomic_thread_fence_seq_cst();
 	for (; sva < eva; sva += cpu_clflush_line_size)
 		clwb(sva);
 	atomic_thread_fence_seq_cst();
 }
 
 /*
  * Flush the cache lines for the page-aligned physical range [spa, epa).
  *
  * The part of the range below dmaplimit is flushed through the direct
  * map.  Any remainder is flushed one page at a time through a temporary
  * kernel virtual address whose PTE is rewritten for each page, using the
  * cache bits derived from "mattr".
  */
 void
 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 {
 	pt_entry_t *pte;
 	vm_offset_t vaddr;
 	int error, pte_bits;
 
 	KASSERT((spa & PAGE_MASK) == 0,
 	    ("pmap_flush_cache_phys_range: spa not page-aligned"));
 	KASSERT((epa & PAGE_MASK) == 0,
 	    ("pmap_flush_cache_phys_range: epa not page-aligned"));
 
 	if (spa < dmaplimit) {
 		pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
 		    dmaplimit, epa)));
 		/* The whole range was covered by the direct map. */
 		if (dmaplimit >= epa)
 			return;
 		spa = dmaplimit;
 	}
 
 	pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
 	    X86_PG_V;
 	/* One page of KVA serves as a window onto each physical page. */
 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 	    &vaddr);
 	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 	pte = vtopte(vaddr);
 	for (; spa < epa; spa += PAGE_SIZE) {
 		/*
 		 * Only the local TLB is invalidated with invlpg(), so the
 		 * thread stays pinned while the window is in use.
 		 */
 		sched_pin();
 		pte_store(pte, spa | pte_bits);
 		invlpg(vaddr);
 		/* XXXKIB atomic inside flush_cache_range are excessive */
 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 		sched_unpin();
 	}
 	vmem_free(kernel_arena, vaddr, PAGE_SIZE);
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Return the physical address that the given pmap maps
  *		the given virtual address to, or 0 when no valid
  *		PDP or PD entry covers it.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	vm_paddr_t pa;
 
 	pa = 0;
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe == NULL || (*pdpe & PG_V) == 0)
 		goto done;
 
 	/* Large page mapped directly by the PDP entry. */
 	if ((*pdpe & PG_PS) != 0) {
 		pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 		goto done;
 	}
 
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	if ((*pde & PG_V) == 0)
 		goto done;
 
 	if ((*pde & PG_PS) != 0) {
 		/* Large page mapped directly by the PD entry. */
 		pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
 	} else {
 		/* The PTE is used without checking its valid bit. */
 		pte = pmap_pde_to_pte(pde, va);
 		pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 	}
 done:
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  *
  *		Returns NULL when there is no valid mapping, when
  *		write access is requested on a read-only mapping,
  *		or when vm_page_wire_mapped() fails.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pdp_entry_t pdpe, *pdpep;
 	pd_entry_t pde, *pdep;
 	pt_entry_t pte, PG_RW, PG_V;
 	vm_page_t m;
 
 	m = NULL;
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 
 	pdpep = pmap_pdpe(pmap, va);
 	if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0)
 		goto out;
 	/* Large page mapped directly by the PDP entry. */
 	if ((pdpe & PG_PS) != 0) {
 		if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
 			goto out;
 		m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK));
 		goto check_page;
 	}
 
 	pdep = pmap_pdpe_to_pde(pdpep, va);
 	if (pdep == NULL || ((pde = *pdep) & PG_V) == 0)
 		goto out;
 	/* Large page mapped directly by the PD entry. */
 	if ((pde & PG_PS) != 0) {
 		if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
 			goto out;
 		m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK));
 		goto check_page;
 	}
 
 	pte = *pmap_pde_to_pte(pdep, va);
 	if ((pte & PG_V) == 0 ||
 	    ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0))
 		goto out;
 	m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 
 check_page:
 	/* Take the wiring; the page is dropped if that fails. */
 	if (m != NULL && !vm_page_wire_mapped(m))
 		m = NULL;
 out:
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Translate a kernel virtual address into a physical address.  Direct map
  * and large map addresses are handled by their own conversions; any other
  * address is resolved through the kernel page tables.  No lock is taken
  * here and the valid bits of the PDE and PTE are not checked.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pd_entry_t pde;
 	vm_paddr_t pa;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 		pa = DMAP_TO_PHYS(va);
 	} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
 		pa = pmap_large_map_kextract(va);
 	} else {
 		/* Snapshot the PDE once; see the comment below. */
 		pde = *vtopde(va);
 		if (pde & PG_PS) {
 			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 		} else {
 			/*
 			 * Beware of a concurrent promotion that changes the
 			 * PDE at this point!  For example, vtopte() must not
 			 * be used to access the PTE because it would use the
 			 * new PDE.  It is, however, safe to use the old PDE
 			 * because the page table page is preserved by the
 			 * promotion.
 			 */
 			pa = *pmap_pde_to_pte(&pde, va);
 			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 		}
 	}
 	return (pa);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Enter a wired, writable mapping of physical address "pa" at kernel
  * virtual address "va".
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pte_store(vtopte(va), pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx);
 }
 
 /*
  * Like pmap_kenter(), but the PTE additionally carries the cache bits
  * that pmap_cache_bits() derives from "mode".
  */
 static __inline void
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	pt_entry_t *ptep;
 	int attr_bits;
 
 	ptep = vtopte(va);
 	attr_bits = pmap_cache_bits(kernel_pmap, mode, 0);
 	pte_store(ptep, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | attr_bits);
 }
 
 /*
  * Clear the kernel page table entry that maps "va".
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 
 	pte_clear(vtopte(va));
 }
 
 /*
  *	Map a range of physical addresses into kernel virtual address
  *	space.
  *
  *	'*virt' is a suggested virtual address for the mapping.
  *	Architectures that support a direct-mapped physical to virtual
  *	region can return the appropriate address within that region and
  *	leave '*virt' unchanged; other architectures should map the pages
  *	starting at '*virt' and update '*virt' with the first usable
  *	address after the mapped region.
  *
  *	This implementation returns the direct map address of 'start';
  *	'virt', 'end' and 'prot' are not used.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 
 	return (PHYS_TO_DMAP(start));
 }
 
 /*
  * Enter a list of wired pages into the kva.  This routine is only used
  * for temporary kernel mappings that do not need to have page
  * modification or references recorded.  Old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *first, prev, want, *pte;
 	vm_page_t m;
 	int i;
 
 	prev = 0;
 	first = vtopte(sva);
 	for (i = 0; i < count; i++) {
 		pte = first + i;
 		m = ma[i];
 		want = VM_PAGE_TO_PHYS(m) |
 		    pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 		/* Skip entries that already map this page with these bits. */
 		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) == want)
 			continue;
 		prev |= *pte;
 		pte_store(pte, want | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
 	}
 
 	/* A shootdown is needed only if a valid entry was replaced. */
 	if (__predict_false((prev & X86_PG_V) != 0))
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * Tear out "count" page mappings from the kernel, starting at "sva".
  * Meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	for (va = sva; count > 0; count--, va += PAGE_SIZE) {
 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 		pmap_kremove(va);
 	}
 	/* "va" now points just past the last page removed. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue an unused page table page for freeing: the page is pushed onto
  * the given list, whose pages are released to the physical memory manager
  * after the TLB has been updated.  PG_ZERO is set or cleared on the page
  * according to "set_PG_ZERO".
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Inserts the specified page table page into the specified pmap's collection
  * of idle page table pages.  Each of a pmap's page table pages is responsible
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  *
  * If "promoted" is false, then the page table page "mpte" must be zero filled.
  *
  * The pmap lock must be held.  Returns the result of vm_radix_insert().
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* The valid field records whether the page came from a promotion. */
 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
 	return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
  * Removes the page table page mapping the specified virtual address from the
  * specified pmap's collection of idle page table pages, and returns it.
  * Otherwise, returns NULL if there is no page table page corresponding to the
  * specified virtual address.
  *
  * The pmap lock must be held.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* The collection is keyed by the PDE index of the address. */
 	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
 }
 
 /*
  * Drop one reference on a page table page.  The reference count records
  * the number of valid page table entries within the page.  When the count
  * reaches zero the page table page is unmapped and TRUE is returned;
  * otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	if (--m->ref_count != 0)
 		return (FALSE);
 	_pmap_unwire_ptp(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page "m" whose reference count has reached zero.
  * The entry in the parent level that points to it is cleared, a reference
  * on the parent page table page is dropped (which may recurse up the
  * hierarchy), and "m" is placed on the delayed free list.  The level of
  * "m" is determined from its pindex.  The pmap lock must be held.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	pml5_entry_t *pml5;
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pd;
 	vm_page_t pdpg, pdppg, pml4pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * unmap the page table page
 	 */
 	if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
 		/* PML4 page */
 		MPASS(pmap_is_la57(pmap));
 		pml5 = pmap_pml5e(pmap, va);
 		*pml5 = 0;
 		/* Also clear the entry in the user-mode top-level table. */
 		if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
 			pml5 = pmap_pml5e_u(pmap, va);
 			*pml5 = 0;
 		}
 	} else if (m->pindex >= NUPDE + NUPDPE) {
 		/* PDP page */
 		pml4 = pmap_pml4e(pmap, va);
 		*pml4 = 0;
 		/* Same for the user-mode table, in the non-LA57 case. */
 		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
 		    va <= VM_MAXUSER_ADDRESS) {
 			pml4 = pmap_pml4e_u(pmap, va);
 			*pml4 = 0;
 		}
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pdp = pmap_pdpe(pmap, va);
 		*pdp = 0;
 	} else {
 		/* PTE page */
 		pd = pmap_pde(pmap, va);
 		*pd = 0;
 	}
 	if (m->pindex < NUPDE) {
 		/* We just released a PT, unhold the matching PD */
 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdpg, free);
 	} else if (m->pindex < NUPDE + NUPDPE) {
 		/* We just released a PD, unhold the matching PDP */
 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdppg, free);
 	} else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
 		/* We just released a PDP, unhold the matching PML4 */
 		pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pml4pg, free);
 	}
 
 	pmap_pt_page_count_adj(pmap, -1);
 
 	/*
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the reference count.
  *
  * "ptepde" is the PDE that maps the page table page holding the removed
  * entry.  Returns 0 for addresses at or above VM_MAXUSER_ADDRESS, where
  * no reference is dropped; otherwise returns the result of
  * pmap_unwire_ptp().
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 	vm_page_t mpte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	/* The message text echoes the asserted condition. */
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 	return (pmap_unwire_ptp(pmap, va, mpte, free));
 }
 
 /*
  * Release a page table page reference after a failed attempt to create a
  * mapping.  If that drops the last reference, the TLB is invalidated for
  * "va" and the page table pages collected on the local list are freed.
  */
 static void
 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
 {
 	struct spglist free;
 
 	SLIST_INIT(&free);
 	if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 		/*
 		 * Although "va" was never mapped, paging-structure caches
 		 * could nonetheless have entries that refer to the freed
 		 * page table pages.  Invalidate those entries.
 		 */
 		pmap_invalidate_page(pmap, va);
 		vm_page_free_pages_toq(&free, true);
 	}
 }
 
 /*
  * Initialize the given pmap so that it shares the kernel pmap's top-level
  * page table and CR3, activate it with pmap_activate_boot(), and perform
  * the related one-time setup for the current thread and, when PKU is
  * available, the PKRU ranges zone.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 	struct proc *p;
 	struct thread *td;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pmltop = kernel_pmap->pm_pmltop;
 	pmap->pm_pmltopu = NULL;
 	pmap->pm_cr3 = kernel_pmap->pm_cr3;
 	/* hack to keep pmap_pti_pcid_invalidate() alive */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = pmap_flags;
 	/* Give every CPU the same initial PCID and generation. */
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
 	td = curthread;
 	/* With PTI enabled, flag the current process as using KPTI. */
 	if (pti) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		p->p_md.md_flags |= P_MD_KPTI;
 		PROC_UNLOCK(p);
 	}
 	pmap_thread_init_invl_gen(td);
 
 	/* The PKRU ranges zone exists only when the CPU reports PKU. */
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
 		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 	}
 }
 
 /*
  * Populate a new PML4 page, accessed through the direct map, with the
  * kernel's shared entries: kernel address space, optional KASAN/KMSAN
  * shadow regions, the direct map, the self-referential entry, and any
  * configured large map entries.
  */
 void
 pmap_pinit_pml4(vm_page_t pml4pg)
 {
 	pml4_entry_t *pm_pml4;
 	int i;
 
 	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 
 	/* Wire in kernel global address entries. */
 	for (i = 0; i < NKPML4E; i++) {
 		pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
 		    X86_PG_V;
 	}
 #ifdef KASAN
 	/* KASAN shadow map entries, mapped no-execute. */
 	for (i = 0; i < NKASANPML4E; i++) {
 		pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW |
 		    X86_PG_V | pg_nx;
 	}
 #endif
 #ifdef KMSAN
 	/* KMSAN shadow and origin map entries, mapped no-execute. */
 	for (i = 0; i < NKMSANSHADPML4E; i++) {
 		pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) |
 		    X86_PG_RW | X86_PG_V | pg_nx;
 	}
 	for (i = 0; i < NKMSANORIGPML4E; i++) {
 		pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) |
 		    X86_PG_RW | X86_PG_V | pg_nx;
 	}
 #endif
 	/* Direct map entries. */
 	for (i = 0; i < ndmpdpphys; i++) {
 		pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
 		    X86_PG_V;
 	}
 
 	/* install self-referential address mapping entry(s) */
 	pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
 	    X86_PG_A | X86_PG_M;
 
 	/* install large map entries if configured */
 	for (i = 0; i < lm_ents; i++)
 		pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
 }
 
 /*
  * Populate a new PML5 page, accessed through the direct map, with the
  * entry pointing at the kernel's existing PML4 table and with the
  * self-referential entry.
  */
 void
 pmap_pinit_pml5(vm_page_t pml5pg)
 {
 	pml5_entry_t *pm_pml5;
 
 	pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
 
 	/*
 	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
 	 * entering all existing kernel mappings into level 5 table.
 	 */
 	pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
 	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
 	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
 
 	/*
 	 * Install self-referential address mapping entry.
 	 */
 	pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
 	    X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A |
 	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
 }
 
 /*
  * Initialize a user-mode (PTI) PML4 page by copying all NPML4EPG entries
  * from the pti_pml4 template.
  */
 static void
 pmap_pinit_pml4_pti(vm_page_t pml4pgu)
 {
 	pml4_entry_t *pm_pml4u;
 	int i;
 
 	pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
 	for (i = 0; i < NPML4EPG; i++)
 		pm_pml4u[i] = pti_pml4[i];
 }
 
 /*
  * Initialize a user-mode (PTI) PML5 page: the page is zeroed and then a
  * single entry pointing at the pti_pml4 table is installed.
  */
 static void
 pmap_pinit_pml5_pti(vm_page_t pml5pgu)
 {
 	pml5_entry_t *pm_pml5u;
 
 	pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
 	pagezero(pm_pml5u);
 
 	/*
 	 * Add pml5 entry at top of KVA pointing to existing pml4 pti
 	 * table, entering all kernel mappings needed for usermode
 	 * into level 5 table.
 	 */
 	pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
 	    pmap_kextract((vm_offset_t)pti_pml4) |
 	    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
 	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
 }
 
 /*
  * Allocate a page table page with vm_page_alloc_noobj(), record "pindex"
  * in it and account for it via pmap_pt_page_count_adj().  Returns NULL,
  * with no accounting done, when the allocation fails.
  */
 static vm_page_t
 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags)
 {
 	vm_page_t page;
 
 	page = vm_page_alloc_noobj(flags);
 	if (__predict_false(page == NULL))
 		return (NULL);
 	pmap_pt_page_count_adj(pmap, 1);
 	page->pindex = pindex;
 	return (page);
 }
 
 /*
  * Unwire and free a page table page and adjust the pmap's page table page
  * count.  "zerofilled" selects vm_page_free_zero() over vm_page_free().
  */
 static void
 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled)
 {
 	/*
 	 * This function assumes the page will need to be unwired,
 	 * even though the counterpart allocation in pmap_alloc_pt_page()
 	 * doesn't enforce VM_ALLOC_WIRED.  However, all current uses
 	 * of pmap_free_pt_page() require unwiring.  The case in which
 	 * a PT page doesn't require unwiring because its ref_count has
 	 * naturally reached 0 is handled through _pmap_unwire_ptp().
 	 */
 	vm_page_unwire_noq(m);
 	if (zerofilled)
 		vm_page_free_zero(m);
 	else
 		vm_page_free(m);
 
 	pmap_pt_page_count_adj(pmap, -1);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * pm_type selects the page table flavor (PT_X86, PT_EPT or PT_RVI) and
  * flags is stored verbatim in pm_flags.  A top-level page table page is
  * always allocated.  For PT_X86 the kernel mappings are installed in it
  * and, when the current process has P_MD_KPTI set, a second top-level
  * page is allocated and populated for user mode.  Always returns 1.
  */
 int
 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
 	vm_page_t pmltop_pg, pmltop_pgu;
 	vm_paddr_t pmltop_phys;
 	int i;
 
 	/*
 	 * allocate the page directory page
 	 *
 	 * NOTE(review): the result is used without a NULL check; presumably
 	 * VM_ALLOC_WAITOK guarantees success -- confirm.
 	 */
 	pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
 	    VM_ALLOC_WAITOK);
 
 	pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
 	pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
 
 	/* No PCID is assigned on any CPU yet. */
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
 	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_pmltopu = NULL;
 
 	pmap->pm_type = pm_type;
 
 	/*
 	 * Do not install the host kernel mappings in the nested page
 	 * tables. These mappings are meaningless in the guest physical
 	 * address space.
 	 * Install minimal kernel mappings in PTI case.
 	 */
 	switch (pm_type) {
 	case PT_X86:
 		pmap->pm_cr3 = pmltop_phys;
 		if (pmap_is_la57(pmap))
 			pmap_pinit_pml5(pmltop_pg);
 		else
 			pmap_pinit_pml4(pmltop_pg);
 		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
 			/*
 			 * VM_ALLOC_ZERO is not requested for the user-mode
 			 * page: pmap_pinit_pml5_pti() zeroes it and
 			 * pmap_pinit_pml4_pti() writes all NPML4EPG entries.
 			 */
 			pmltop_pgu = pmap_alloc_pt_page(NULL, 0,
 			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 			pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
 			    VM_PAGE_TO_PHYS(pmltop_pgu));
 			if (pmap_is_la57(pmap))
 				pmap_pinit_pml5_pti(pmltop_pgu);
 			else
 				pmap_pinit_pml4_pti(pmltop_pgu);
 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
 		}
 		/* The PKRU range set exists only when the CPU reports PKU. */
 		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
 			    pkru_free_range, pmap, M_NOWAIT);
 		}
 		break;
 	case PT_EPT:
 	case PT_RVI:
 		/* Nested page tables: no kernel mappings, pm_cr3 stays unset. */
 		pmap->pm_eptsmr = smr_create("pmap", 0, 0);
 		break;
 	}
 
 	/* Common bookkeeping, independent of the pmap type. */
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = flags;
 	pmap->pm_eptgen = 0;
 
 	return (1);
 }
 
 /*
  * Initialize a pmap as an ordinary PT_X86 address space using pmap_flags;
  * a thin wrapper around pmap_pinit_type() that passes its result through.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	int rv;
 
 	rv = pmap_pinit_type(pmap, PT_X86, pmap_flags);
 	return (rv);
 }
 
 /*
  * If the page table page referenced by *pte has a zero ref_count, unwire
  * it from the paging structure, invalidate va and free whatever pages
  * _pmap_unwire_ptp() collected.  Pages still referenced are left alone.
  */
 static void
 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte)
 {
 	struct spglist freelist;
 	vm_page_t ptpg;
 
 	ptpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 	if (ptpg->ref_count == 0) {
 		SLIST_INIT(&freelist);
 		_pmap_unwire_ptp(pmap, va, ptpg, &freelist);
 		pmap_invalidate_page(pmap, va);
 		vm_page_free_pages_toq(&freelist, true);
 	}
 }
 
 /*
  * Return a pointer to the PML4 entry for va, allocating the PML4 page
  * through pmap_allocpte_nosleep() when LA57 is active and the PML5 entry
  * is not valid.  Returns NULL if that allocation fails.
  *
  * When the returned PML4 entry is not yet valid, the PML4 page's ref_count
  * is adjusted so that it ends up counting the caller's pending entry
  * exactly when "addref" is true.
  */
 static pml4_entry_t *
 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
     bool addref)
 {
 	pml5_entry_t *pml5e;
 	pml4_entry_t *pml4e;
 	vm_page_t pml4pg;
 	pt_entry_t PG_V;
 	bool fresh;
 
 	/* With 4-level paging the top-level page is the PML4 itself. */
 	if (!pmap_is_la57(pmap))
 		return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
 
 	PG_V = pmap_valid_bit(pmap);
 	pml5e = &pmap->pm_pmltop[pmap_pml5e_index(va)];
 	fresh = (*pml5e & PG_V) == 0;
 	if (fresh && pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp,
 	    va) == NULL)
 		return (NULL);
 
 	pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
 	pml4e += pmap_pml4e_index(va);
 	if ((*pml4e & PG_V) == 0) {
 		pml4pg = PHYS_TO_VM_PAGE(*pml5e & PG_FRAME);
 		if (fresh != addref) {
 			/*
 			 * A page allocated just now but not to be
 			 * referenced loses a count; a pre-existing page
 			 * gaining an entry gets one more.
 			 */
 			if (fresh)
 				pml4pg->ref_count--;
 			else
 				pml4pg->ref_count++;
 		}
 	}
 	return (pml4e);
 }
 
 /*
  * Return a pointer to the PDP entry for va, allocating the PDP page (and,
  * via pmap_allocpte_getpml4(), the PML4 page) when missing.  Returns NULL
  * if an allocation fails.
  *
  * When the returned PDP entry is not yet valid, the PDP page's ref_count
  * is adjusted so that it ends up counting the caller's pending entry
  * exactly when "addref" is true.
  */
 static pdp_entry_t *
 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
     bool addref)
 {
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	vm_page_t pdppg;
 	pt_entry_t PG_V;
 	bool fresh;
 
 	PG_V = pmap_valid_bit(pmap);
 
 	pml4e = pmap_allocpte_getpml4(pmap, lockp, va, false);
 	if (pml4e == NULL)
 		return (NULL);
 
 	/* A missing PDP page is allocated by recursing into the allocator. */
 	fresh = (*pml4e & PG_V) == 0;
 	if (fresh && pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp,
 	    va) == NULL) {
 		/* Release a PML4 page that was left without references. */
 		if (pmap_is_la57(pmap))
 			pmap_allocpte_free_unref(pmap, va,
 			    pmap_pml5e(pmap, va));
 		return (NULL);
 	}
 
 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 	pdpe += pmap_pdpe_index(va);
 	if ((*pdpe & PG_V) == 0) {
 		pdppg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
 		if (fresh != addref) {
 			if (fresh)
 				pdppg->ref_count--;
 			else
 				pdppg->ref_count++;
 		}
 	}
 	return (pdpe);
 }
 
 /*
  * Allocate one page table page with page index ptepindex and link it into
  * the paging structure that translates va, allocating any missing upper
  * level pages by recursion.  Returns the new page, or NULL if an
  * allocation failed; in that case the page allocated by this invocation
  * has been freed again.  The pmap lock must be held (asserted).
  *
  * The ptepindexes, i.e. page indices, of the page table pages encountered
  * while translating virtual address va are defined as follows:
  * - for the page table page (last level),
  *      ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
  *   in other words, it is just the index of the PDE that maps the page
  *   table page.
  * - for the page directory page,
  *      ptepindex = NUPDE (number of userland PD entries) +
  *          (pmap_pde_index(va) >> NPDEPGSHIFT)
  *   i.e. index of PDPE is put after the last index of PDE,
  * - for the page directory pointer page,
  *      ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
  *          NPML4EPGSHIFT)),
  *   i.e. index of pml4e is put after the last index of PDPE,
  * - for the PML4 page (if LA57 mode is enabled),
  *      ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
  *          (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)),
  *   i.e. index of pml5e is put after the last index of PML4E.
  *
  * Define an order on the paging entries, where all entries of the
  * same height are put together, then heights are put from deepest to
  * root.  Then ptepindex is the sequential number of the
  * corresponding paging entry in this order.
  *
  * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
  * LA57 paging structures even in LA48 paging mode. Moreover, the
  * ptepindexes are calculated as if the paging structures were 5-level
  * regardless of the actual mode of operation.
  *
  * The root page at PML4/PML5 does not participate in this indexing scheme,
  * since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
  */
 static vm_page_t
 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
     vm_offset_t va)
 {
 	vm_pindex_t pml5index, pml4index;
 	pml5_entry_t *pml5, *pml5u;
 	pml4_entry_t *pml4, *pml4u;
 	pdp_entry_t *pdp;
 	pd_entry_t *pd;
 	vm_page_t m, pdpg;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	m = pmap_alloc_pt_page(pmap, ptepindex,
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 	if (m == NULL)
 		return (NULL);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 *
 	 * The branches below are ordered from the highest level (PML4 page,
 	 * hooked into a PML5 entry) down to the lowest (PT page, hooked
 	 * into a PD entry), selected by the range ptepindex falls into.
 	 */
 	if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
 		MPASS(pmap_is_la57(pmap));
 
 		pml5index = pmap_pml5e_index(va);
 		pml5 = &pmap->pm_pmltop[pml5index];
 		KASSERT((*pml5 & PG_V) == 0,
 		    ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
 		*pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 
 		/*
 		 * With a user-mode top-level page present, mirror the entry
 		 * there; the kernel-mode copy is additionally marked
 		 * no-execute when a user CR3 is in use.
 		 */
 		if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
 			if (pmap->pm_ucr3 != PMAP_NO_CR3)
 				*pml5 |= pg_nx;
 
 			pml5u = &pmap->pm_pmltopu[pml5index];
 			*pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
 			    PG_A | PG_M;
 		}
 	} else if (ptepindex >= NUPDE + NUPDPE) {
 		pml4index = pmap_pml4e_index(va);
 		/* Wire up a new PDPE page */
 		pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
 		if (pml4 == NULL) {
 			pmap_free_pt_page(pmap, m, true);
 			return (NULL);
 		}
 		KASSERT((*pml4 & PG_V) == 0,
 		    ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 
 		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
 		    pml4index < NUPML4E) {
 			/*
 			 * PTI: Make all user-space mappings in the
 			 * kernel-mode page table no-execute so that
 			 * we detect any programming errors that leave
 			 * the kernel-mode page table active on return
 			 * to user space.
 			 */
 			if (pmap->pm_ucr3 != PMAP_NO_CR3)
 				*pml4 |= pg_nx;
 
 			pml4u = &pmap->pm_pmltopu[pml4index];
 			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
 			    PG_A | PG_M;
 		}
 	} else if (ptepindex >= NUPDE) {
 		/* Wire up a new PDE page */
 		pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
 		if (pdp == NULL) {
 			pmap_free_pt_page(pmap, m, true);
 			return (NULL);
 		}
 		KASSERT((*pdp & PG_V) == 0,
 		    ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
 		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 	} else {
 		/* Wire up a new PTE page */
 		pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
 		if (pdp == NULL) {
 			pmap_free_pt_page(pmap, m, true);
 			return (NULL);
 		}
 		if ((*pdp & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 		  if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va),
 		      lockp, va) == NULL) {
 				/*
 				 * Undo: release a PDP page left without
 				 * references, then our own page.
 				 */
 				pmap_allocpte_free_unref(pmap, va,
 				    pmap_pml4e(pmap, va));
 				pmap_free_pt_page(pmap, m, true);
 				return (NULL);
 			}
 		} else {
 			/* Add reference to the pd page */
 			pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 			pdpg->ref_count++;
 		}
 		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 
 		/* Now we know where the page directory page is */
 		pd = &pd[pmap_pde_index(va)];
 		KASSERT((*pd & PG_V) == 0,
 		    ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
 		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 	}
 
 	return (m);
 }
 
 /*
  * Allocate the page table page that is missing for va.
  *
  * On allocation failure NULL is returned.  If a lock pointer was supplied,
  * the routine first drops the PV list lock and the pmap lock, waits in
  * vm_wait() and retakes the pmap lock.  Because the sleep happens only
  * right before returning, the pmap lock is never dropped while a page
  * table page has ref_count == 0, so such a page cannot be freed under us.
  */
 static vm_page_t
 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
     vm_offset_t va)
 {
 	vm_page_t ptpg;
 
 	ptpg = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va);
 	if (ptpg != NULL || lockp == NULL)
 		return (ptpg);
 
 	/* Allocation failed and the caller allows sleeping. */
 	RELEASE_PV_LIST_LOCK(lockp);
 	PMAP_UNLOCK(pmap);
 	PMAP_ASSERT_NOT_IN_DI();
 	vm_wait(NULL);
 	PMAP_LOCK(pmap);
 	return (ptpg);
 }
 
 /*
  * Return a pointer to the page directory entry for va, allocating the page
  * directory page for user addresses when it is missing.
  *
  * *pdpgp receives the page directory page on which a reference was taken
  * (user addresses) or NULL (kernel addresses).  Returns NULL only when the
  * allocation fails and lockp is NULL; with a lock pointer the lookup is
  * retried after pmap_allocpte_alloc() has slept.  Panics for a kernel
  * address whose page directory page is missing.
  */
 static pd_entry_t *
 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
     struct rwlock **lockp)
 {
 	pdp_entry_t *pdpe, PG_V;
 	pd_entry_t *pde;
 	vm_page_t pdpg;
 
 	PG_V = pmap_valid_bit(pmap);
 
 	for (;;) {
 		pdpe = pmap_pdpe(pmap, va);
 		if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 			pde = pmap_pdpe_to_pde(pdpe, va);
 			pdpg = NULL;
 			if (va < VM_MAXUSER_ADDRESS) {
 				/* Add a reference to the pd page. */
 				pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 				pdpg->ref_count++;
 			}
 			break;
 		}
 		if (va >= VM_MAXUSER_ADDRESS)
 			panic("pmap_alloc_pde: missing page table page for va %#lx",
 			    va);
 
 		/* Allocate a pd page. */
 		pdpg = pmap_allocpte_alloc(pmap,
 		    NUPDE + (pmap_pde_pindex(va) >> NPDPEPGSHIFT), lockp, va);
 		if (pdpg != NULL) {
 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 			pde += pmap_pde_index(va);
 			break;
 		}
 		if (lockp == NULL)
 			return (NULL);
 	}
 	*pdpgp = pdpg;
 	return (pde);
 }
 
 /*
  * Return the page table page that maps va with one more reference on it,
  * allocating the page if it does not exist.  A valid 2MB mapping covering
  * va is demoted first.  Returns NULL only when allocation fails and lockp
  * is NULL; with a lock pointer the whole lookup is retried.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *pde, PG_V;
 	vm_page_t ptpg;
 	vm_pindex_t pindex;
 
 	PG_V = pmap_valid_bit(pmap);
 
 	/* Page index of the page table page for va. */
 	pindex = pmap_pde_pindex(va);
 
 	do {
 		pde = pmap_pde(pmap, va);
 
 		/*
 		 * Switching from a 2MB page to 4K pages is supported.  A
 		 * failed demotion invalidates the 2MB mapping, which may
 		 * have deallocated the underlying PD page, so forget the
 		 * pointer in that case.
 		 */
 		if (pde != NULL && (*pde & (PG_PS | PG_V)) == (PG_PS | PG_V) &&
 		    !pmap_demote_pde_locked(pmap, pde, va, lockp))
 			pde = NULL;
 
 		if (pde == NULL || (*pde & PG_V) == 0) {
 			/* The pte page is unmapped or was deallocated. */
 			ptpg = pmap_allocpte_alloc(pmap, pindex, lockp, va);
 		} else {
 			/* Already mapped: just take another reference. */
 			ptpg = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			ptpg->ref_count++;
 		}
 	} while (ptpg == NULL && lockp != NULL);
 	return (ptpg);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * Frees the top-level page table page, the user-mode (PTI) top-level page
  * if one exists, and the PKRU range set of a PT_X86 pmap on CPUs that
  * report PKU.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t m;
 	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap %p resident count %ld != 0",
 	    pmap, pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap %p has reserved page table page(s)",
 	    pmap));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	/* pm_pmltop is a direct map address; recover the backing vm_page. */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
 
 	/*
 	 * Clear the kernel-range slots of the top-level page; the page is
 	 * then handed to pmap_free_pt_page() as zero-filled.
 	 */
 	if (pmap_is_la57(pmap)) {
 		pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
 		pmap->pm_pmltop[PML5PML5I] = 0;
 	} else {
 		for (i = 0; i < NKPML4E; i++)	/* KVA */
 			pmap->pm_pmltop[KPML4BASE + i] = 0;
 #ifdef KASAN
 		for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */
 			pmap->pm_pmltop[KASANPML4I + i] = 0;
 #endif
 #ifdef KMSAN
 		for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */
 			pmap->pm_pmltop[KMSANSHADPML4I + i] = 0;
 		for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN origin map */
 			pmap->pm_pmltop[KMSANORIGPML4I + i] = 0;
 #endif
 		for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 			pmap->pm_pmltop[DMPML4I + i] = 0;
 		pmap->pm_pmltop[PML4PML4I] = 0;	/* Recursive Mapping */
 		for (i = 0; i < lm_ents; i++)	/* Large Map */
 			pmap->pm_pmltop[LMSPML4I + i] = 0;
 	}
 
 	pmap_free_pt_page(NULL, m, true);
 
 	/* The user-mode top-level page is freed without the zero-filled hint. */
 	if (pmap->pm_pmltopu != NULL) {
 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
 		    pm_pmltopu));
 		pmap_free_pt_page(NULL, m, false);
 	}
 	/* Same condition under which pmap_pinit_type() set up pm_pkru. */
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 		rangeset_fini(&pmap->pm_pkru);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the span between
  * VM_MIN_KERNEL_ADDRESS and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long span;
 
 	span = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 	return (sysctl_handle_long(oidp, &span, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, kvm_size, "LU",
     "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the distance from the current
  * kernel_vm_end up to VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long remaining;
 
 	remaining = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &remaining, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, kvm_free, "LU",
     "Amount of KVM free");
 
 #ifdef KMSAN
 /*
  * Populate the page directory pointer page at physical address pdppa so
  * that "size" bytes are mapped no-execute through dummy structures: one
  * zeroed data page, one page table page whose entries all reference that
  * data page, and npdpg page directory pages whose entries all reference
  * that page table page.
  */
 static void
 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size)
 {
 	pdp_entry_t *pdpt;
 	pd_entry_t *pdt;
 	pt_entry_t *ptt;
 	vm_paddr_t datapa, pdpa, ptpa;
 	int n, npde, npdpg;
 
 	npdpg = howmany(size, NBPDP);
 	npde = size / NBPDR;
 
 	/* The single data page backing every mapping. */
 	datapa = vm_phys_early_alloc(-1, PAGE_SIZE);
 	pagezero((void *)PHYS_TO_DMAP(datapa));
 
 	/* One page table page and npdpg page directory pages, all zeroed. */
 	ptpa = vm_phys_early_alloc(-1, PAGE_SIZE);
 	pagezero((void *)PHYS_TO_DMAP(ptpa));
 	pdpa = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg);
 	for (n = 0; n < npdpg; n++)
 		pagezero((void *)PHYS_TO_DMAP(pdpa + ptoa(n)));
 
 	ptt = (pt_entry_t *)PHYS_TO_DMAP(ptpa);
 	for (n = 0; n < NPTEPG; n++) {
 		ptt[n] = (pt_entry_t)(datapa | X86_PG_V | X86_PG_RW |
 		    X86_PG_A | X86_PG_M | pg_nx);
 	}
 
 	pdt = (pd_entry_t *)PHYS_TO_DMAP(pdpa);
 	for (n = 0; n < npde; n++) {
 		pdt[n] = (pd_entry_t)(ptpa | X86_PG_V | X86_PG_RW | pg_nx);
 	}
 
 	pdpt = (pdp_entry_t *)PHYS_TO_DMAP(pdppa);
 	for (n = 0; n < npdpg; n++) {
 		pdpt[n] = (pdp_entry_t)((pdpa + ptoa(n)) | X86_PG_V |
 		    X86_PG_RW | pg_nx);
 	}
 }
 
 /*
  * Map dummy KMSAN structures for the vm_page array spanning [start, end),
  * once under KMSANSHADPDPphys and once under KMSANORIGPDPphys.  "start"
  * must be NBPDP-aligned.
  */
 static void
 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end)
 {
 	vm_size_t len;
 
 	KASSERT(start % NBPDP == 0, ("unaligned page array start address"));
 
 	/*
 	 * The page array's KVA region ends on a 2MB boundary (see
 	 * kmem_init()), hence the rounding.
 	 */
 	len = round_2mpage(end) - start;
 	pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, len);
 	pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, len);
 }
 #endif
 
 /*
  * Allocate physical memory for the vm_page array and map it into KVA,
  * attempting to back the vm_pages with domain-local memory.
  */
 void
 pmap_page_array_startup(long pages)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde, newpdir;
 	vm_offset_t va, start, end;
 	vm_paddr_t pa;
 	long pfn;
 	int domain, i;
 
 	vm_page_array_size = pages;
 
 	start = VM_MIN_KERNEL_ADDRESS;
 	end = start + pages * sizeof(struct vm_page);
 	for (va = start; va < end; va += NBPDR) {
 		pfn = first_page + (va - start) / sizeof(struct vm_page);
 		domain = vm_phys_domain(ptoa(pfn));
 		pdpe = pmap_pdpe(kernel_pmap, va);
 		if ((*pdpe & X86_PG_V) == 0) {
 			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
 			dump_add_page(pa);
 			pagezero((void *)PHYS_TO_DMAP(pa));
 			*pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW |
 			    X86_PG_A | X86_PG_M);
 		}
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		if ((*pde & X86_PG_V) != 0)
 			panic("Unexpected pde");
 		pa = vm_phys_early_alloc(domain, NBPDR);
 		for (i = 0; i < NPDEPG; i++)
 			dump_add_page(pa + i * PAGE_SIZE);
 		newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A |
 		    X86_PG_M | PG_PS | pg_g | pg_nx);
 		pde_store(pde, newpdir);
 	}
 	vm_page_array = (vm_page_t)start;
 
 #ifdef KMSAN
 	pmap_kmsan_page_array_startup(start, end);
 #endif
 }
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Advances kernel_vm_end in NBPDR steps until it reaches addr (rounded up
  * to NBPDR and clamped to vm_map_max(kernel_map)), allocating page
  * directory and page table pages where none are installed.  The kernel
  * map's system mutex must be held (asserted).  Panics if a page cannot be
  * allocated.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *pde, newpdir;
 	pdp_entry_t *pdpe;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	/*
 	 * Return if "addr" is within the range of kernel page table pages
 	 * that were preallocated during pmap bootstrap.  Moreover, leave
 	 * "kernel_vm_end" and the kernel page table as they were.
 	 *
 	 * The correctness of this action is based on the following
 	 * argument: vm_map_insert() allocates contiguous ranges of the
 	 * kernel virtual address space.  It calls this function if a range
 	 * ends after "kernel_vm_end".  If the kernel is mapped between
 	 * "kernel_vm_end" and "addr", then the range cannot begin at
 	 * "kernel_vm_end".  In fact, its beginning address cannot be less
 	 * than the kernel.  Thus, there is no immediate need to allocate
 	 * any new kernel page table pages between "kernel_vm_end" and
 	 * "KERNBASE".
 	 */
 	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 		return;
 
 	addr = roundup2(addr, NBPDR);
 	/*
 	 * Clamp to the end of the kernel map.  NOTE(review): comparing
 	 * "addr - 1" presumably also catches roundup2() wrapping to 0 --
 	 * confirm.
 	 */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	/* Extend the sanitizer shadow maps over the range being added. */
 	if (kernel_vm_end < addr)
 		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
 	if (kernel_vm_end < addr)
 		kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
 	while (kernel_vm_end < addr) {
 		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 		if ((*pdpe & X86_PG_V) == 0) {
 			/* We need a new PDP entry */
 			nkpg = pmap_alloc_pt_page(kernel_pmap,
 			    kernel_vm_end >> PDPSHIFT, VM_ALLOC_WIRED |
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 			    X86_PG_A | X86_PG_M);
 			continue; /* try again */
 		}
 		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 		if ((*pde & X86_PG_V) != 0) {
 			/* Already populated: step to the next 2MB boundary. */
 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		/* Install a new page table page for this PDE. */
 		nkpg = pmap_alloc_pt_page(kernel_pmap,
 		    pmap_pde_pindex(kernel_vm_end), VM_ALLOC_WIRED |
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 		pde_store(pde, newpdir);
 
 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 }
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time layout checks for struct pv_chunk.  pv_to_chunk() masks a
  * pv entry address with PAGE_MASK, so a chunk must be exactly one page;
  * the free masks below (PC_FREE0..PC_FREE2) are written for three bitmap
  * words covering 64 + 64 + 40 = 168 pv entries.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv_chunk containing the given pv entry, found by clearing
  * the PAGE_MASK bits of the entry's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t addr;
 
 	addr = (uintptr_t)pv;
 	addr &= ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)addr);
 }
 
 /* The pmap owning a pv entry, taken from the header of its chunk. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Values of the three pc_map[] words when every pv entry of a chunk is
  * free: a set bit marks a free entry.  The last word has only its low 40
  * bits set.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The same masks, indexable by bitmap word. */
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 #ifdef PV_STATS
 
 /*
  * PV entry and PV chunk statistics.  They are compiled in only with
  * PV_STATS and exported read-only under the vm.pmap sysctl node.
  */
 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
     &pc_chunk_count, "Current number of pv entry chunks");
 
 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
     &pc_chunk_allocs, "Total number of pv entry chunks allocated");
 
 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
     &pc_chunk_frees, "Total number of pv entry chunks freed");
 
 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
     &pc_chunk_tryfail,
     "Number of failed attempts to get a pv entry chunk page");
 
 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
     &pv_entry_frees, "Total number of pv entries freed");
 
 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
     &pv_entry_allocs, "Total number of pv entries allocated");
 
 static COUNTER_U64_DEFINE_EARLY(pv_entry_count);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
     &pv_entry_count, "Current number of pv entries");
 
 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare);
 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
     &pv_entry_spare, "Current number of spare pv entries");
 #endif
 
 /*
  * Finish working on "pmap" during pv chunk reclamation: invalidate all of
  * its TLB entries, unlock it unless it is the caller's locked_pmap, and
  * close the delayed invalidation block if start_di says one was opened.
  * A NULL pmap is a no-op.
  */
 static void
 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
 {
 
 	if (pmap != NULL) {
 		pmap_invalidate_all(pmap);
 		if (pmap != locked_pmap)
 			PMAP_UNLOCK(pmap);
 		if (start_di)
 			pmap_delayed_invl_finish();
 	}
 }
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Scans the pv chunk list of the given domain and destroys non-wired 4 KB
  * mappings until a page becomes available.  Returns that page: either a
  * pv chunk that became entirely free, or a freed page table page that is
  * recycled with its ref_count set to 1.
  *
  * Returns NULL if PV entries were reclaimed from the specified pmap.
  * NULL is also returned when the scan ends without freeing any page.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
 static vm_page_t
 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
 {
 	struct pv_chunks_list *pvc;
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t next_pmap, pmap;
 	pt_entry_t *pte, tpte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint64_t inuse;
 	int bit, field, freed;
 	bool start_di, restart;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 	pmap = NULL;
 	m_pc = NULL;
 	PG_G = PG_A = PG_M = PG_RW = 0;
 	SLIST_INIT(&free);
 	/*
 	 * The markers are zeroed chunk headers on the stack, so their
 	 * pc_pmap is NULL; that is how markers are told apart from real
 	 * chunks below.
 	 */
 	bzero(&pc_marker_b, sizeof(pc_marker_b));
 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 	pc_marker = (struct pv_chunk *)&pc_marker_b;
 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 
 	/*
 	 * A delayed invalidation block should already be active if
 	 * pmap_advise() or pmap_remove() called this function by way
 	 * of pmap_demote_pde_locked().
 	 */
 	start_di = pmap_not_in_di();
 
 	pvc = &pv_chunks[domain];
 	mtx_lock(&pvc->pvc_lock);
 	pvc->active_reclaims++;
 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
 	/* Scan until the end marker is reached or a PT page was freed. */
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
 		if (next_pmap == NULL) {
 			/*
 			 * The next chunk is a marker.  However, it is
 			 * not our marker, so active_reclaims must be
 			 * > 1.  Consequently, the next_chunk code
 			 * will not rotate the pv_chunks list.
 			 */
 			goto next_chunk;
 		}
 		mtx_unlock(&pvc->pvc_lock);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
 		 * when both the chunk list lock (pvc_lock) is owned and
 		 * the corresponding pmap is locked.
 		 */
 		if (pmap != next_pmap) {
 			restart = false;
 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
 			    start_di);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				if (start_di)
 					pmap_delayed_invl_start();
 				mtx_lock(&pvc->pvc_lock);
 				restart = true;
 			} else if (pmap != locked_pmap) {
 				if (PMAP_TRYLOCK(pmap)) {
 					if (start_di)
 						pmap_delayed_invl_start();
 					mtx_lock(&pvc->pvc_lock);
 					restart = true;
 				} else {
 					pmap = NULL; /* pmap is not locked */
 					mtx_lock(&pvc->pvc_lock);
 					pc = TAILQ_NEXT(pc_marker, pc_lru);
 					if (pc == NULL ||
 					    pc->pc_pmap != next_pmap)
 						continue;
 					goto next_chunk;
 				}
 			} else if (start_di)
 				pmap_delayed_invl_start();
 			PG_G = pmap_global_bit(pmap);
 			PG_A = pmap_accessed_bit(pmap);
 			PG_M = pmap_modified_bit(pmap);
 			PG_RW = pmap_rw_bit(pmap);
 			/*
 			 * The chunk list lock was dropped while locking
 			 * the pmap, so re-read the chunk after the marker.
 			 */
 			if (restart)
 				continue;
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map marks an allocated pv entry. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = bsfq(inuse);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va);
 				if ((*pde & PG_PS) != 0)
 					continue;
 				pte = pmap_pde_to_pte(pde, va);
 				if ((*pte & PG_W) != 0)
 					continue;
 				tpte = pte_load_clear(pte);
 				if ((tpte & PG_G) != 0)
 					pmap_invalidate_page(pmap, va);
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 					vm_page_dirty(m);
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				pmap_delayed_invl_page(m);
 				/* Mark the pv entry free again. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, *pde, &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			mtx_lock(&pvc->pvc_lock);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap_resident_count_adj(pmap, -freed);
 		PV_STAT(counter_u64_add(pv_entry_frees, freed));
 		PV_STAT(counter_u64_add(pv_entry_spare, freed));
 		PV_STAT(counter_u64_add(pv_entry_count, -freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 		    pc->pc_map[2] == PC_FREE2) {
 			PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
 			PV_STAT(counter_u64_add(pc_chunk_count, -1));
 			PV_STAT(counter_u64_add(pc_chunk_frees, 1));
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
 			mtx_lock(&pvc->pvc_lock);
 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
 			break;
 		}
 		/* Chunk now has free entries: move it to the head of the pmap's list. */
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		mtx_lock(&pvc->pvc_lock);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 next_chunk:
 		/* Advance our marker past the chunk just examined. */
 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
 		if (pvc->active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
 			 * freed (because they contained a wired
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
 			}
 		}
 	}
 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
 	pvc->active_reclaims--;
 	mtx_unlock(&pvc->pvc_lock);
 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->ref_count = 1;
 	}
 	/* Any remaining freed page table pages go back to the system. */
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Try to reclaim a page for a pv chunk, starting with the current CPU's
  * domain and visiting every domain once in round-robin order.  Returns
  * the first page obtained, or the last (NULL) result if no domain yielded
  * one.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 	vm_page_t m;
 	int dom, tries;
 
 	dom = PCPU_GET(domain);
 	tries = 0;
 	while (tries < vm_ndomains) {
 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, dom);
 		if (m != NULL)
 			return (m);
 		dom = (dom + 1) % vm_ndomains;
 		tries++;
 	}
 
 	return (m);
 }
 
 /*
  * Return a pv entry to its chunk.  The entry's bit is set in the chunk's
  * free bitmap.  A chunk that thereby becomes entirely free is unlinked
  * from the pmap and released; otherwise the chunk is moved to the head of
  * the pmap's chunk list.  The pmap lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *pc;
 	int slot;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(counter_u64_add(pv_entry_frees, 1));
 	PV_STAT(counter_u64_add(pv_entry_spare, 1));
 	PV_STAT(counter_u64_add(pv_entry_count, -1));
 	pc = pv_to_chunk(pv);
 	slot = pv - &pc->pc_pventry[0];
 	pc->pc_map[slot / 64] |= 1ul << (slot % 64);
 	if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 	    pc->pc_map[2] == PC_FREE2) {
 		/* Nothing is allocated from this chunk any more. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		free_pv_chunk(pc);
 		return;
 	}
 	/* 98% of the time, pc is already at the head of the list. */
 	if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	}
 }
 
 /*
  * Release the page backing an entirely free pv chunk: update the
  * statistics, drop the page from the dump set, then unwire and free it.
  * This function does not touch any chunk list.
  */
 static void
 free_pv_chunk_dequeued(struct pv_chunk *pc)
 {
 	vm_page_t chunkpg;
 
 	PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
 	PV_STAT(counter_u64_add(pc_chunk_count, -1));
 	PV_STAT(counter_u64_add(pc_chunk_frees, 1));
 	counter_u64_add(pv_page_count, -1);
 
 	/* The chunk is addressed through the direct map. */
 	chunkpg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(chunkpg->phys_addr);
 	vm_page_unwire_noq(chunkpg);
 	vm_page_free(chunkpg);
 }
 
 /*
  * Unlink a pv chunk from its domain's chunk list under that list's lock,
  * then release the chunk's page.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	struct pv_chunks_list *domlist;
 
 	domlist = &pv_chunks[pc_to_domain(pc)];
 
 	mtx_lock(&domlist->pvc_lock);
 	TAILQ_REMOVE(&domlist->pvc_list, pc, pc_lru);
 	mtx_unlock(&domlist->pvc_lock);
 
 	free_pv_chunk_dequeued(pc);
 }
 
 /*
  * Free a batch of pv chunks given as an array of per-domain lists with
  * vm_ndomains elements.  All chunks are first unlinked from the domain
  * chunk lists, taking each domain's lock once, and only then released.
  */
 static void
 free_pv_chunk_batch(struct pv_chunklist *batch)
 {
 	struct pv_chunks_list *pvc;
 	struct pv_chunk *chunk, *next;
 	int dom;
 
 	/* Pass 1: unlink from the per-domain lists. */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		if (!TAILQ_EMPTY(&batch[dom])) {
 			pvc = &pv_chunks[dom];
 			mtx_lock(&pvc->pvc_lock);
 			TAILQ_FOREACH(chunk, &batch[dom], pc_list) {
 				TAILQ_REMOVE(&pvc->pvc_list, chunk, pc_lru);
 			}
 			mtx_unlock(&pvc->pvc_lock);
 		}
 	}
 
 	/* Pass 2: release the pages, with no chunk list lock held. */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		TAILQ_FOREACH_SAFE(chunk, &batch[dom], pc_list, next) {
 			free_pv_chunk_dequeued(chunk);
 		}
 	}
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * The given PV list lock may be released.
  *
  * The pmap lock must be held by the caller.  NULL is returned only when
  * "lockp" is NULL, i.e., when reclamation is disabled; with a non-NULL
  * "lockp" the function keeps retrying until it obtains an entry.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	struct pv_chunks_list *pvc;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(counter_u64_add(pv_entry_allocs, 1));
 retry:
 	/* Only the chunk at the head of the pmap's chunk list is examined. */
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word that has a set (free) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfq(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			/* Claim the entry by clearing its bit. */
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(counter_u64_add(pv_entry_count, 1));
 			PV_STAT(counter_u64_add(pv_entry_spare, -1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 	if (m == NULL) {
 		/* A NULL lockp means that the caller forbids reclamation. */
 		if (lockp == NULL) {
 			PV_STAT(counter_u64_add(pc_chunk_tryfail, 1));
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		/* Nothing was reclaimed; start over from the top. */
 		if (m == NULL)
 			goto retry;
 	} else
 		counter_u64_add(pv_page_count, 1);
 	PV_STAT(counter_u64_add(pc_chunk_count, 1));
 	PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* The chunk is accessed through the page's direct map address. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	/* Append the chunk to its domain's chunk list under that list's lock. */
 	pvc = &pv_chunks[vm_page_domain(m)];
 	mtx_lock(&pvc->pvc_lock);
 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
 	mtx_unlock(&pvc->pvc_lock);
 	/* Entry 0 is the one whose bit was preallocated above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(counter_u64_add(pv_entry_count, 1));
 	PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Returns the number of one bits within the given PV chunk map.
  *
  * The erratas for Intel processors state that "POPCNT Instruction May
  * Take Longer to Execute Than Expected".  It is believed that the
  * issue is the spurious dependency on the destination register.
  * Provide a hint to the register rename logic that the destination
  * value is overwritten, by clearing it, as suggested in the
  * optimization manual.  It should be cheap for unaffected processors
  * as well.
  *
  * Reference numbers for erratas are
  * 4th Gen Core: HSD146
  * 5th Gen Core: BDM85
  * 6th Gen Core: SKL029
  *
  * Exactly three 64-bit words, map[0] through map[2], are counted and
  * their counts are summed.  The sum is returned as an int.
  */
 static int
 popcnt_pc_map_pq(uint64_t *map)
 {
 	u_long result, tmp;
 
 	/*
 	 * Each popcntq is preceded by an xorl that clears its destination
 	 * register.  The count of map[0] goes directly into "result"; the
 	 * counts of map[1] and map[2] go through "tmp" and are added to
 	 * "result".  Both outputs are early-clobber ("=&r").
 	 */
 	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
 	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
 	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
 	    : "=&r" (result), "=&r" (tmp)
 	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
 	return (result);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * The given PV list lock may be released.
  *
  * The pmap lock must be held and "lockp" must not be NULL.  New chunks are
  * linked into the pmap immediately but are added to the per-domain chunk
  * lists only once the reservation is complete.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pv_chunks_list *pvc;
 	struct pch new_tail[PMAP_MEMDOM];
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free, i;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	for (i = 0; i < PMAP_MEMDOM; i++)
 		TAILQ_INIT(&new_tail[i]);
 retry:
 	/*
 	 * Count the spare entries in the pmap's chunks, starting at the head
 	 * of the list.  The scan stops at the first chunk without any spare
 	 * entry or as soon as enough entries have been found.
 	 */
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 #ifndef __POPCNT__
 		/* Fall back to bit_count() when the CPU lacks POPCNT. */
 		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
 			bit_count((bitstr_t *)pc->pc_map, 0,
 			    sizeof(pc->pc_map) * NBBY, &free);
 		else
 #endif
 		free = popcnt_pc_map_pq(pc->pc_map);
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Allocate whole chunks until the shortfall is covered. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		} else
 			counter_u64_add(pv_page_count, 1);
 		PV_STAT(counter_u64_add(pc_chunk_count, 1));
 		PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		/* All entries of the new chunk start out free. */
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
 		PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV));
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/*
 	 * The reservation is complete: append the privately held chunks to
 	 * the per-domain chunk lists, each under its own lock.
 	 */
 	for (i = 0; i < vm_ndomains; i++) {
 		if (TAILQ_EMPTY(&new_tail[i]))
 			continue;
 		pvc = &pv_chunks[i];
 		mtx_lock(&pvc->pvc_lock);
 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
 		mtx_unlock(&pvc->pvc_lock);
 	}
 }
 
 /*
  * Search the given pv list for the pv entry that belongs to the specified
  * pmap and virtual address.  If it is found, unlink it from the list, bump
  * the list's generation count and return it; otherwise return NULL.  The
  * pv list may be that of either a 4KB or a 2MB page mapping.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t entry;
 
 	TAILQ_FOREACH(entry, &pvh->pv_list, pv_next) {
 		if (pmap != PV_PMAP(entry) || va != entry->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, entry, pv_next);
 		pvh->pv_gen++;
 		return (entry);
 	}
 	return (NULL);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * The pmap lock must be held and "pa" must be 2MB aligned.  No memory is
  * allocated here: the NPTEPG - 1 additional pv entries are taken from the
  * spare entries already present in the pmap's chunks, and the function
  * asserts that such spares exist.
  */
 static void
 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 	int bit, field;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1));
 	va_last = va + NBPDR - PAGE_SIZE;
 	for (;;) {
 		/* Always draw from the chunk at the head of the pmap's list. */
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field]) {
 				/* Claim a free entry by clearing its bit. */
 				bit = bsfq(pc->pc_map[field]);
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/*
 				 * "va" and "m" are advanced before use, so the
 				 * first entry created here is for the second
 				 * 4KB page.
 				 */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_pde: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* This chunk is exhausted; move it to the tail and go on. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used is now exhausted, move it to the tail too. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1));
 	PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1)));
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Once 512 4KB page mappings have been promoted to one 2MB page mapping,
  * collapse their pv entries: the first 4KB page's entry is reused as the
  * entry for the 2MB page mapping and the other NPTEPG - 1 entries are
  * freed.  "pa" must be 2MB aligned.
  */
 static void
 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *super_pvh;
 	pv_entry_t first_pv;
 	vm_offset_t last_va;
 	vm_page_t page;
 
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Move the first page's pv entry for this mapping over to the
 	 * 2mpage's pv list.  Besides saving a call to get_pv_entry(), the
 	 * move rules out that get_pv_entry() calls reclaim_pv_chunk() and
 	 * that reclaim_pv_chunk() removes one of the mappings that is
 	 * being promoted.
 	 */
 	page = PHYS_TO_VM_PAGE(pa);
 	va = trunc_2mpage(va);
 	first_pv = pmap_pvh_remove(&page->md, pmap, va);
 	KASSERT(first_pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 	super_pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&super_pvh->pv_list, first_pv, pv_next);
 	super_pvh->pv_gen++;
 
 	/* Free the pv entries of the remaining NPTEPG - 1 pages. */
 	last_va = va + NBPDR - PAGE_SIZE;
 	for (;;) {
 		page++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&page->md, pmap, va);
 		if (va >= last_va)
 			break;
 	}
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Look up the pv entry for the specified pmap and virtual address on the
  * given pv list, unlink it and release it.  The entry is asserted to
  * exist.  Works for the pv lists of both 4KB and 2MB page mappings.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t entry;
 
 	entry = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(entry != NULL, ("pmap_pvh_free: pv not found"));
 
 	free_pv_entry(pmap, entry);
 }
 
 /*
  * Try to create the PV entry for a 4KB page mapping without falling back
  * to reclamation.  Returns TRUE if the entry was allocated and appended to
  * the page's pv list, and FALSE if no entry could be allocated.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	entry = get_pv_entry(pmap, NULL);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * Create the PV entry for a 2MB page mapping.  The result is always true
  * unless PMAP_ENTER_NORECLAIM is set in "flags"; with that flag, false is
  * returned when the PV entry cannot be allocated without reclamation.
  */
 static bool
 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *super_pvh;
 	pv_entry_t entry;
 	vm_paddr_t pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	reclaim_lockp = (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp;
 	entry = get_pv_entry(pmap, reclaim_lockp);
 	if (entry == NULL)
 		return (false);
 
 	entry->pv_va = va;
 	pa = pde & PG_PS_FRAME;
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	super_pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&super_pvh->pv_list, entry, pv_next);
 	super_pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Populate all NPTEPG entries of a page table page, starting with "newpte"
  * and adding PAGE_SIZE to the stored value for each successive entry, so
  * that the entries map consecutive physical pages.
  */
 static void
 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 {
 	pt_entry_t *cur, *end;
 
 	cur = firstpte;
 	end = firstpte + NPTEPG;
 	while (cur < end) {
 		*cur = newpte;
 		cur++;
 		newpte += PAGE_SIZE;
 	}
 }
 
 /*
  * Attempt to demote a 2MB page mapping.  When the demotion fails, the 2MB
  * page mapping is invalidated.  This is a wrapper around
  * pmap_demote_pde_locked() that starts without a PV list lock and drops
  * whichever lock the callee left held.
  */
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	struct rwlock *pvlock = NULL;
 	boolean_t demoted;
 
 	demoted = pmap_demote_pde_locked(pmap, pde, va, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 
 	return (demoted);
 }
 
 /*
  * Consistency check of a page table page against the PTE value "newpte"
  * expected for its first entry.
  *
  * The body is compiled only when INVARIANTS is defined; otherwise the
  * function is empty, which is why both parameters are marked __unused.
  * With DIAGNOSTIC also defined, every one of the NPTEPG entries is
  * compared against the expected frame, and on a mismatch the whole page
  * table page is printed before panicking.  Without DIAGNOSTIC, only the
  * first entry is checked, by a KASSERT.
  */
 static void
 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
 {
 #ifdef INVARIANTS
 #ifdef DIAGNOSTIC
 	pt_entry_t *xpte, *ypte;
 
 	/* The expected value advances by PAGE_SIZE for each entry. */
 	for (xpte = firstpte; xpte < firstpte + NPTEPG;
 	    xpte++, newpte += PAGE_SIZE) {
 		if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
 			printf("pmap_demote_pde: xpte %zd and newpte map "
 			    "different pages: found %#lx, expected %#lx\n",
 			    xpte - firstpte, *xpte, newpte);
 			printf("page table dump\n");
 			for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
 				printf("%zd %#lx\n", ypte - firstpte, *ypte);
 			panic("firstpte");
 		}
 	}
 #else
 	/* Cheap variant: compare only the first entry's frame. */
 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 	    ("pmap_demote_pde: firstpte and newpte map different physical"
 	    " addresses"));
 #endif
 #endif
 }
 
 /*
  * Failure path of a 2MB page demotion: remove the 2MB page mapping that
  * contains "va", invalidate it here unless "oldpde" has the pmap's global
  * bit set, free the pages collected by the removal and log the failure.
  */
 static void
 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t oldpde, struct rwlock **lockp)
 {
 	struct spglist freed;
 	vm_offset_t base;
 
 	SLIST_INIT(&freed);
 	base = trunc_2mpage(va);
 
 	pmap_remove_pde(pmap, pde, base, &freed, lockp);
 	if ((oldpde & pmap_global_bit(pmap)) == 0)
 		pmap_invalidate_pde_page(pmap, base, oldpde);
 	vm_page_free_pages_toq(&freed, true);
 
 	CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
 	    va, pmap);
 }
 
 /*
  * Tries to demote the 2MB page mapping described by "*pde", which covers
  * "va", into 4KB page mappings held in a page table page.
  *
  * The pmap lock must be held.  "lockp" is the PV list lock pointer that is
  * passed on to the PV entry helpers; the lock it designates may change.
  *
  * Returns TRUE on success.  On failure, the 2MB page mapping is destroyed
  * by pmap_demote_pde_abort() and FALSE is returned.
  */
 static boolean_t
 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 	bool in_kernel;
 
 	/*
 	 * Fetch the pmap-specific page table bits.
 	 * NOTE(review): PG_G, PG_PKU_MASK and PG_PTE_CACHE are not referenced
 	 * by name below; presumably they are consumed through the
 	 * PG_PTE_PROMOTE macro -- confirm against its definition.
 	 */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	in_kernel = va >= VM_MAXUSER_ADDRESS;
 	oldpde = *pde;
 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 
 	/*
 	 * Invalidate the 2MB page mapping and return "failure" if the
 	 * mapping was never accessed.
 	 */
 	if ((oldpde & PG_A) == 0) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: a wired mapping is missing PG_A"));
 		pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 		return (FALSE);
 	}
 
 	/* Prefer a page table page that was saved for this mapping. */
 	mpte = pmap_remove_pt_page(pmap, va);
 	if (mpte == NULL) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * If the page table page is missing and the mapping
 		 * is for a kernel address, the mapping must belong to
 		 * the direct map.  Page table pages are preallocated
 		 * for every other part of the kernel address space,
 		 * so the direct map region is the only part of the
 		 * kernel address space that must be handled here.
 		 */
 		KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
 		    va < DMAP_MAX_ADDRESS),
 		    ("pmap_demote_pde: No saved mpte for va %#lx", va));
 
 		/*
 		 * If the 2MB page mapping belongs to the direct map
 		 * region of the kernel's address space, then the page
 		 * allocation request specifies the highest possible
 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
 		 * priority is normal.
 		 */
 		mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
 		    (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED);
 
 		/*
 		 * If the allocation of the new page table page fails,
 		 * invalidate the 2MB page mapping and return "failure".
 		 */
 		if (mpte == NULL) {
 			pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 			return (FALSE);
 		}
 
 		/*
 		 * For a user address, the new page table page's reference
 		 * count is set to NPTEPG, one per entry it will hold.
 		 */
 		if (!in_kernel)
 			mpte->ref_count = NPTEPG;
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	/* The new PDE points at the page table page instead of a 2MB frame. */
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pde: oldpde is missing PG_M"));
 	/* The first 4KB PTE is derived from the old PDE without PG_PS. */
 	newpte = oldpde & ~PG_PS;
 	newpte = pmap_swap_pat(pmap, newpte);
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * initialize it.
 	 */
 	if (mpte->valid == 0)
 		pmap_fill_ptp(firstpte, newpte);
 
 	pmap_demote_pde_check(firstpte, newpte);
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 		pmap_fill_ptp(firstpte, newpte);
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
 	 * of the PDE and the PV lists will be inconsistent, which can result
 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
 	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 	 * PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 
 	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
 	 * processor changing the setting of PG_A and/or PG_M between
 	 * the read above and the store below.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else
 		pde_store(pde, newpde);
 
 	/*
 	 * Invalidate a stale recursive mapping of the page table page.
 	 */
 	if (in_kernel)
 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 
 	counter_u64_add(pmap_pde_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
 	    va, pmap);
 	return (TRUE);
 }
 
 /*
  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
  *
  * The PDE is not left empty: it is overwritten with a PDE that points at
  * the page table page saved for this address, after that page has been
  * zeroed if it held valid mappings.  "pmap" must be the kernel pmap and
  * must be locked.  Panics if no page table page was saved for "va".
  */
 static void
 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mpte = pmap_remove_pt_page(pmap, va);
 	if (mpte == NULL)
 		panic("pmap_remove_kernel_pde: Missing pt page.");
 
 	/* Build a PDE that references the page table page. */
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 
 	/*
 	 * If this page table page was unmapped by a promotion, then it
 	 * contains valid mappings.  Zero it to invalidate those mappings.
 	 */
 	if (mpte->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(mptepa));
 
 	/*
 	 * Demote the mapping.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else
 		pde_store(pde, newpde);
 
 	/*
 	 * Invalidate a stale recursive mapping of the page table page.
 	 */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 }
 
 /*
  * pmap_remove_pde: do the things to unmap a superpage in a process
  *
  * Clears the PDE "*pdq" that maps the 2MB page at "sva" and updates the
  * wired and resident counts.  For a managed mapping, the PV entry is freed
  * and the modified/referenced state of the old PDE is propagated to each
  * of the constituent 4KB pages.  Page table pages that become free are
  * collected on "free".  The pmap lock must be held and "sva" must be 2MB
  * aligned.
  *
  * Returns the value returned by pmap_unuse_pt().
  */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m, mpte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_remove_pde: sva is not 2mpage aligned"));
 	oldpde = pte_load_clear(pdq);
 	if (oldpde & PG_W)
 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 	/* A global mapping is invalidated right here. */
 	if ((oldpde & PG_G) != 0)
 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 	pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
 	if (oldpde & PG_MANAGED) {
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
 		/* Visit each 4KB page that made up the 2MB page. */
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/*
 			 * PGA_WRITEABLE is cleared once neither the page's
 			 * own pv list nor the 2MB page's pv list has any
 			 * entry left.
 			 */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 			pmap_delayed_invl_page(m);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_pde(pmap, pdq, sva);
 	} else {
 		/*
 		 * If a page table page was saved for this mapping, drop it:
 		 * reset its reference count and queue it on "free".
 		 */
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_remove_pde: pte page not promoted"));
 			pmap_resident_count_adj(pmap, -1);
 			KASSERT(mpte->ref_count == NPTEPG,
 			    ("pmap_remove_pde: pte page ref count error"));
 			mpte->ref_count = 0;
 			pmap_add_delayed_free_list(mpte, free, FALSE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  *
  * Clears the PTE "*ptq" that maps "va" and updates the wired and resident
  * counts.  For a managed mapping, the modified/referenced state of the old
  * PTE is propagated to the page and the PV entry is freed.  The pmap lock
  * must be held.  No TLB invalidation is issued by this function itself;
  * it only calls pmap_delayed_invl_page() for a managed page.
  *
  * Returns the value returned by pmap_unuse_pt() for the page table page
  * referenced by "ptepde".
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
 	vm_page_t m;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	if (oldpte & PG_W)
 		pmap->pm_stats.wired_count -= 1;
 	pmap_resident_count_adj(pmap, -1);
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		pmap_pvh_free(&m->md, pmap, va);
 		/*
 		 * PGA_WRITEABLE is cleared once the page's pv list is empty
 		 * and, for a non-fictitious page, the pv list of the
 		 * containing 2MB page is empty as well.
 		 */
 		if (TAILQ_EMPTY(&m->md.pv_list) &&
 		    (m->flags & PG_FICTITIOUS) == 0) {
 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 			if (TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 		pmap_delayed_invl_page(m);
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
 
 /*
  * Remove the mapping of the single page at "va" from a process address
  * space.  Nothing is done if either the PDE or the PTE is not valid.  A PV
  * list lock acquired during the removal is dropped before the page's TLB
  * entry is invalidated.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     struct spglist *free)
 {
 	struct rwlock *pvlock;
 	pt_entry_t *pte, PG_V;
 
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((*pde & PG_V) != 0) {
 		pte = pmap_pde_to_pte(pde, va);
 		if ((*pte & PG_V) != 0) {
 			pvlock = NULL;
 			pmap_remove_pte(pmap, pte, va, *pde, free, &pvlock);
 			if (pvlock != NULL)
 				rw_wunlock(pvlock);
 			pmap_invalidate_page(pmap, va);
 		}
 	}
 }
 
 /*
  * Removes the specified range of addresses from the page table page.
  *
  * Walks the PTEs for [sva, eva) within the page table page referenced by
  * "pde" and removes every non-zero one.  Runs of consecutive global
  * (PG_G) mappings are invalidated here with pmap_invalidate_range().
  *
  * Returns true if at least one non-global mapping was encountered;
  * presumably the caller is then responsible for invalidating it.
  */
 static bool
 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t PG_G, *pte;
 	vm_offset_t va;
 	bool anyvalid;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PG_G = pmap_global_bit(pmap);
 	anyvalid = false;
 	/*
 	 * "va" is the start of the current run of global mappings awaiting
 	 * invalidation; the value "eva" means that no run is pending.
 	 */
 	va = eva;
 	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
 	    sva += PAGE_SIZE) {
 		if (*pte == 0) {
 			/* An empty PTE ends a pending run; flush it. */
 			if (va != eva) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = eva;
 			}
 			continue;
 		}
 		if ((*pte & PG_G) == 0)
 			anyvalid = true;
 		else if (va == eva)
 			va = sva;
 		/*
 		 * A non-zero return ends the scan early.  "sva" is advanced
 		 * first so that the final invalidation below covers this
 		 * page too.
 		 */
 		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
 			sva += PAGE_SIZE;
 			break;
 		}
 	}
 	/* Flush a run that was still pending when the scan ended. */
 	if (va != eva)
 		pmap_invalidate_range(pmap, va, sva);
 	return (anyvalid);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The pmap lock is acquired and released here.  Pages freed
  *	during the removal are collected on a local list and released
  *	only after the pmap lock has been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct rwlock *lock;
 	vm_page_t mt;
 	vm_offset_t va_next;
 	pml5_entry_t *pml5e;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t PG_G, PG_V;
 	struct spglist free;
 	int anyvalid;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	/*
 	 * "anyvalid" records that a mapping was removed whose TLB entry is
 	 * invalidated by the pmap_invalidate_all() call at "out".
 	 */
 	anyvalid = 0;
 	SLIST_INIT(&free);
 
 	pmap_delayed_invl_start();
 	PMAP_LOCK(pmap);
 	pmap_pkru_on_remove(pmap, sva, eva);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if (sva + PAGE_SIZE == eva) {
 		pde = pmap_pde(pmap, sva);
 		if (pde && (*pde & PG_PS) == 0) {
 			pmap_remove_page(pmap, sva, pde, &free);
 			goto out;
 		}
 	}
 
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 		/* Nothing is left to remove once the pmap is empty. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		/*
 		 * Locate the PML4 entry, going through the PML5 level when
 		 * pmap_is_la57() reports that the pmap uses it.  At every
 		 * level, "va_next < sva" detects address wraparound, in
 		 * which case the scan is clamped to "eva".
 		 */
 		if (pmap_is_la57(pmap)) {
 			pml5e = pmap_pml5e(pmap, sva);
 			if ((*pml5e & PG_V) == 0) {
 				va_next = (sva + NBPML5) & ~PML5MASK;
 				if (va_next < sva)
 					va_next = eva;
 				continue;
 			}
 			pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
 		} else {
 			pml4e = pmap_pml4e(pmap, sva);
 		}
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + NBPDP) & ~PDPMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0)
 			continue;
 		if ((*pdpe & PG_PS) != 0) {
 			/*
 			 * A 1GB mapping is removed only as a whole: the PDPE
 			 * is cleared and the reference on the page directory
 			 * pointer page is dropped.
 			 */
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G mapping "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, sva, eva, va_next));
 			MPASS(pmap != kernel_pmap); /* XXXKIB */
 			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
 			anyvalid = 1;
 			*pdpe = 0;
 			pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE);
 			mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
 			pmap_unwire_ptp(pmap, sva, mt, &free);
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_remove_pde().
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
 				pmap_remove_pde(pmap, pde, sva, &free, &lock);
 				continue;
 			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
 			    &lock)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
 				ptpaddr = *pde;
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
 			anyvalid = 1;
 	}
 	/* Drop the PV list lock that the helpers may have left held. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finish();
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page must be managed.  Every 2MB mapping that
  *		includes the page is first passed to
  *		pmap_demote_pde_locked(); then each 4KB mapping of the
  *		page is removed individually.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 	pd_entry_t *pde;
 	vm_offset_t va;
 	struct spglist free;
 	int pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* A fictitious page uses pv_dummy as its 2MB page pv list head. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	rw_wlock(lock);
 retry:
 	/* Phase 1: demote every 2MB mapping that includes the page. */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken immediately, drop the PV
 		 * list lock, take both locks in pmap-first order and use
 		 * the generation count to detect a change to the pv list in
 		 * the meantime.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Phase 2: remove each 4KB mapping of the page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/* Same locking dance as above, now checking both lists. */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pmap_resident_count_adj(pmap, -1);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mapping of the page remains. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(lock);
 	pmap_delayed_invl_wait(m);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * pmap_protect_pde: do the things to protect a 2mpage in a process
  *
  * Applies "prot" to the 2MB page mapping "*pde" at "sva": write permission
  * is removed unless VM_PROT_WRITE is set, and pg_nx is set unless
  * VM_PROT_EXECUTE is set.  The pmap lock must be held and "sva" must be
  * 2MB aligned.
  *
  * Returns TRUE if a PDE without PG_G was changed; no TLB invalidation is
  * performed for it here, presumably so that the caller can batch it.  A
  * changed PG_G mapping is invalidated here and does not cause TRUE to be
  * returned.
  */
 static boolean_t
 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 {
 	pd_entry_t newpde, oldpde;
 	vm_page_t m, mt;
 	boolean_t anychanged;
 	pt_entry_t PG_G, PG_M, PG_RW;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_protect_pde: sva is not 2mpage aligned"));
 	anychanged = FALSE;
 retry:
 	oldpde = newpde = *pde;
 	if ((prot & VM_PROT_WRITE) == 0) {
 		/*
 		 * Before write access is taken away from a managed, modified
 		 * mapping, mark each constituent 4KB page dirty.
 		 */
 		if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 		    (PG_MANAGED | PG_M | PG_RW)) {
 			m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 				vm_page_dirty(mt);
 		}
 		newpde &= ~(PG_RW | PG_M);
 	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 	if (newpde != oldpde) {
 		/*
 		 * As an optimization to future operations on this PDE, clear
 		 * PG_PROMOTED.  The impending invalidation will remove any
 		 * lingering 4KB page mappings from the TLB.
 		 */
 		/* The PDE changed since it was read; recompute and retry. */
 		if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
 			goto retry;
 		if ((oldpde & PG_G) != 0)
 			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 		else
 			anychanged = TRUE;
 	}
 	return (anychanged);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	pmap	the physical map to modify
  *	sva	first virtual address of the range
  *	eva	end of the range (exclusive; the walk stops when sva >= eva)
  *	prot	the requested protection
  *
  *	VM_PROT_NONE is handled by removing the mappings.  Otherwise write
  *	permission is revoked when "prot" lacks VM_PROT_WRITE and no-execute
  *	is set when it lacks VM_PROT_EXECUTE.  This function never grants
  *	permissions: a "prot" that includes both write and execute is a no-op.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_page_t m;
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 	pt_entry_t obits, pbits;
 	boolean_t anychanged;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	/* Nothing to restrict when both write and execute remain allowed. */
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	anychanged = FALSE;
 
 	/*
 	 * Although this function delays and batches the invalidation
 	 * of stale TLB entries, it does not need to call
 	 * pmap_delayed_invl_start() and
 	 * pmap_delayed_invl_finish(), because it does not
 	 * ordinarily destroy mappings.  Stale TLB entries from
 	 * protection-only changes need only be invalidated before the
 	 * pmap lock is released, because protection-only changes do
 	 * not destroy PV entries.  Even operations that iterate over
 	 * a physical page's PV list of mappings, like
 	 * pmap_remove_write(), acquire the pmap lock for each
 	 * mapping.  Consequently, for protection-only changes, the
 	 * pmap lock suffices to synchronize both page table and TLB
 	 * updates.
 	 *
 	 * This function only destroys a mapping if pmap_demote_pde()
 	 * fails.  In that case, stale TLB entries are immediately
 	 * invalidated.
 	 */
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * No valid PML4 entry: skip ahead to the next PML4 entry
 		 * boundary.  If that computation wraps around, finish the
 		 * walk by jumping to eva.
 		 */
 		pml4e = pmap_pml4e(pmap, sva);
 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/* Same skip logic, one level down (PDP entry granularity). */
 		va_next = (sva + NBPDP) & ~PDPMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0)
 			continue;
 		if ((*pdpe & PG_PS) != 0) {
 			/*
 			 * A 1G mapping is updated in place as a whole; it is
 			 * asserted to be unmanaged, non-global, and not in
 			 * the kernel pmap, and the range must cover it.
 			 */
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G mapping "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, sva, eva, va_next));
 retry_pdpe:
 			obits = pbits = *pdpe;
 			MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
 			MPASS(pmap != kernel_pmap); /* XXXKIB */
 			if ((prot & VM_PROT_WRITE) == 0)
 				pbits &= ~(PG_RW | PG_M);
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 
 			if (pbits != obits) {
 				if (!atomic_cmpset_long(pdpe, obits, pbits))
 					/* PG_PS cannot be cleared under us. */
 					goto retry_pdpe;
 				anychanged = TRUE;
 			}
 			continue;
 		}
 
 		/* Advance at most to the next 2MB (PDE) boundary. */
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we protecting the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_protect_pde().
 				 */
 				if (pmap_protect_pde(pmap, pde, sva, prot))
 					anychanged = TRUE;
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, sva)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 		}
 
 		/* Do not run past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 
 		/* Update each 4KB PTE in [sva, va_next). */
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 retry:
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 
 			if ((prot & VM_PROT_WRITE) == 0) {
 				/*
 				 * Preserve the modification state of a
 				 * managed page before PG_M is cleared.
 				 */
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
 			}
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 
 			if (pbits != obits) {
 				if (!atomic_cmpset_long(pte, obits, pbits))
 					goto retry;
 				/*
 				 * Global mappings are invalidated right away;
 				 * all others are covered by the single
 				 * pmap_invalidate_all() below.
 				 */
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = TRUE;
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Returns true only if "pmap" is an EPT pmap and "pde" has the EPT execute
  * permission bit set.  Always returns false for other pmap types.
  */
 static bool
 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
 {
 
 	return (pmap->pm_type == PT_EPT && (pde & EPT_PG_EXECUTE) != 0);
 }
 
 /*
  * Tries to promote the 512, contiguous 4KB page mappings that are within a
  * single page table page (PTP) to a single 2MB page mapping.  For promotion
  * to occur, two conditions must be met: (1) the 4KB page mappings must map
  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  * identical characteristics.
  *
  * "pde" is the page directory entry that currently references the PTP and
  * "va" is a virtual address within the candidate 2MB region.  "lockp" is
  * passed through to pmap_pv_promote_pde() for managed mappings.  The pmap
  * lock must be held.  Every failed attempt is counted in
  * pmap_pde_p_failures; nothing is returned to the caller.
  */
 static void
 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
 	 * either invalid, unused, or does not map the first 4KB physical page
 	 * within a 2MB page.
 	 */
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) ||
 	    !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
 	    newpde))) {
 		counter_u64_add(pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 setpde:
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
 		 */
 		/* On failure, fcmpset reloads newpde; re-evaluate it. */
 		if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW))
 			goto setpde;
 		newpde &= ~PG_RW;
 	}
 
 	/*
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 */
 	/*
 	 * "pa" is the value (frame plus PG_A and PG_V) expected in the last
 	 * PTE; the loop walks backwards, lowering it by one page each step.
 	 */
 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 			counter_u64_add(pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 setpte:
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
 			 */
 			if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW))
 				goto setpte;
 			oldpte &= ~PG_RW;
 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
 			    (va & ~PDRMASK), pmap);
 		}
 		/* All attribute bits covered by PG_PTE_PROMOTE must match. */
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 			counter_u64_add(pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the PDE
 	 * mapping the superpage is demoted by pmap_demote_pde() or
 	 * destroyed by pmap_remove_pde().
 	 */
 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
 	    ("pmap_promote_pde: page table page's pindex is wrong "
 	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
 	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
 	/* A nonzero return from pmap_insert_pt_page() aborts the promotion. */
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		counter_u64_add(pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP,
 		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	/*
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 
 	/*
 	 * Propagate the PAT index to its proper position.
 	 */
 	newpde = pmap_swap_pat(pmap, newpde);
 
 	/*
 	 * Map the superpage.
 	 */
 	/*
 	 * With workaround_erratum383 set, the PDE is written through
 	 * pmap_update_pde() and PG_PROMOTED is not set; otherwise a plain
 	 * store marks the PDE as promoted.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 	else
 		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 
 	counter_u64_add(pmap_pde_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Installs the large page mapping "newpte" at "va": a 1G mapping in a PDP
  * entry when "psind" is 2, otherwise a 2M mapping in a PD entry.  Both "va"
  * and the physical address in "newpte" must be aligned to pagesizes[psind],
  * and the mapping must lie below VM_MAXUSER_ADDRESS.  The pmap lock must be
  * held by the caller.
  *
  * Returns KERN_SUCCESS once the entry is written, KERN_PROTECTION_FAILURE
  * if pmap_pkru_same() reports differing protection keys across the range,
  * or KERN_RESOURCE_SHORTAGE if a page table page cannot be allocated and
  * PMAP_ENTER_NOSLEEP is set in "flags".  Without PMAP_ENTER_NOSLEEP, the
  * pmap lock is dropped around vm_wait() and the operation restarts.
  */
 static int
 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
     int psind)
 {
 	vm_page_t mp;
 	pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0,
 	    ("psind %d unexpected", psind));
 	KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
 	    ("unaligned phys address %#lx newpte %#lx psind %d",
 	    newpte & PG_FRAME, newpte, psind));
 	KASSERT((va & (pagesizes[psind] - 1)) == 0,
 	    ("unaligned va %#lx psind %d", va, psind));
 	KASSERT(va < VM_MAXUSER_ADDRESS,
 	    ("kernel mode non-transparent superpage")); /* XXXKIB */
 	KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
 	    ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */
 
 	PG_V = pmap_valid_bit(pmap);
 
 	/*
 	 * Everything from here on is redone after sleeping for memory,
 	 * because the pmap lock is dropped in the meantime.
 	 */
 restart:
 	if (!pmap_pkru_same(pmap, va, va + pagesizes[psind]))
 		return (KERN_PROTECTION_FAILURE);
 	pten = newpte;
 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
 		pten |= pmap_pkru_get(pmap, va);
 
 	if (psind == 2) {	/* 1G */
 		pml4e = pmap_pml4e(pmap, va);
 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
 			/* No PDP page yet: allocate one without sleeping. */
 			mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va),
 			    NULL, va);
 			if (mp == NULL)
 				goto allocf;
 			pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
 			pdpe = &pdpe[pmap_pdpe_index(va)];
 			origpte = *pdpe;
 			MPASS(origpte == 0);
 		} else {
 			pdpe = pmap_pml4e_to_pdpe(pml4e, va);
 			KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
 			origpte = *pdpe;
 			/*
 			 * A new entry in an existing PDP page takes a
 			 * reference on that page.
 			 */
 			if ((origpte & PG_V) == 0) {
 				mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
 				mp->ref_count++;
 			}
 		}
 		*pdpe = pten;
 	} else /* (psind == 1) */ {	/* 2M */
 		pde = pmap_pde(pmap, va);
 		if (pde == NULL) {
 			/* No PD page yet: allocate one without sleeping. */
 			mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va),
 			    NULL, va);
 			if (mp == NULL)
 				goto allocf;
 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
 			pde = &pde[pmap_pde_index(va)];
 			origpte = *pde;
 			MPASS(origpte == 0);
 		} else {
 			origpte = *pde;
 			/*
 			 * A new entry in an existing PD page takes a
 			 * reference on that page.
 			 */
 			if ((origpte & PG_V) == 0) {
 				pdpe = pmap_pdpe(pmap, va);
 				MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
 				mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 				mp->ref_count++;
 			}
 		}
 		*pde = pten;
 	}
 	/*
 	 * An existing valid entry may only be replaced by a large page
 	 * mapping of the same physical frame.
 	 */
 	KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
 	    (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)),
 	    ("va %#lx changing %s phys page origpte %#lx pten %#lx",
 	    va, psind == 2 ? "1G" : "2M", origpte, pten));
 	/* Account for a change in wiring, counted in 4KB pages. */
 	if ((pten & PG_W) != 0 && (origpte & PG_W) == 0)
 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
 	else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0)
 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
 	/* Only a previously invalid entry adds to the resident count. */
 	if ((origpte & PG_V) == 0)
 		pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE);
 
 	return (KERN_SUCCESS);
 
 allocf:
 	/* Page table page allocation failed: give up or wait and retry. */
 	if ((flags & PMAP_ENTER_NOSLEEP) != 0)
 		return (KERN_RESOURCE_SHORTAGE);
 	PMAP_UNLOCK(pmap);
 	vm_wait(NULL);
 	PMAP_LOCK(pmap);
 	goto restart;
 }
 
 /*
  *	Insert the given physical page (m) at
  *	the specified virtual address (va) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	When destroying both a page table and PV entry, this function
  *	performs the TLB invalidation before releasing the PV list
  *	lock, so we do not need pmap_delayed_invl_page() calls here.
  *
  *	Parameters:
  *	pmap	the target physical map
  *	va	the virtual address; it is truncated to a page boundary
  *	m	the page to map
  *	prot	the protection; VM_PROT_WRITE and VM_PROT_EXECUTE are
  *		examined
  *	flags	VM_PROT_WRITE here presets the modified bit; also honors
  *		PMAP_ENTER_WIRED, PMAP_ENTER_NOSLEEP and
  *		PMAP_ENTER_LARGEPAGE
  *	psind	1 requests a 2MB mapping through pmap_enter_pde(); with
  *		PMAP_ENTER_LARGEPAGE it is passed to pmap_enter_largepage()
  *
  *	Returns KERN_SUCCESS when the 4KB mapping is in place,
  *	KERN_RESOURCE_SHORTAGE when a page table page could not be
  *	allocated under PMAP_ENTER_NOSLEEP, or the result of
  *	pmap_enter_largepage() / pmap_enter_pde() for large mappings.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 	pt_entry_t newpte, origpte;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	int rv;
 	boolean_t nosleep;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	va = trunc_page(va);
 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
 	    ("pmap_enter: managed mapping within the clean submap"));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 	    ("pmap_enter: flags %u has reserved bits set", flags));
 
 	/*
 	 * Build the new PTE from the page's physical address, the requested
 	 * protection and the flags, before any lock is taken.
 	 */
 	pa = VM_PAGE_TO_PHYS(m);
 	newpte = (pt_entry_t)(pa | PG_A | PG_V);
 	if ((flags & VM_PROT_WRITE) != 0)
 		newpte |= PG_M;
 	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= PG_G;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
 
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
 	 * the page is unmanaged. We do not want to take a fault
 	 * to do the dirty bit accounting for these mappings.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		if ((newpte & PG_RW) != 0)
 			newpte |= PG_M;
 	} else
 		newpte |= PG_MANAGED;
 
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	/* Large mappings are delegated to dedicated helpers. */
 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
 		    ("managed largepage va %#lx flags %#x", va, flags));
 		rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
 		    psind);
 		goto out;
 	}
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
 		goto out;
 	}
 	mpte = NULL;
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 retry:
 	pde = pmap_pde(pmap, va);
 	/*
 	 * Use the existing page table page if the PDE is valid and is
 	 * either not a 2MB mapping or can be demoted to 4KB mappings.
 	 */
 	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 		pte = pmap_pde_to_pte(pde, va);
 		/*
 		 * For user addresses, take a reference on the page table
 		 * page unless one is already held from a prior allocation.
 		 */
 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			mpte->ref_count++;
 		}
 	} else if (va < VM_MAXUSER_ADDRESS) {
 		/*
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 		mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va),
 		    nosleep ? NULL : &lock, va);
 		if (mpte == NULL && nosleep) {
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 		goto retry;
 	} else
 		panic("pmap_enter: invalid page directory va=%#lx", va);
 
 	origpte = *pte;
 	pv = NULL;
 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
 		newpte |= pmap_pkru_get(pmap, va);
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((origpte & PG_V) != 0) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->ref_count--;
 			KASSERT(mpte->ref_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		opa = origpte & PG_FRAME;
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((origpte & PG_MANAGED) != 0 &&
 			    (newpte & PG_RW) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			/* Identical apart from PG_M and PG_A: no PTE write. */
 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 				goto unchanged;
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		origpte = pte_load_clear(pte);
 		KASSERT((origpte & PG_FRAME) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((origpte & PG_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(om);
 			if ((origpte & PG_A) != 0) {
 				pmap_invalidate_page(pmap, va);
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			}
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			KASSERT(pv != NULL,
 			    ("pmap_enter: no PV entry for %#lx", va));
 			/*
 			 * The old PV entry is reused below for a managed
 			 * new mapping; otherwise it is released now.
 			 */
 			if ((newpte & PG_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 			/*
 			 * Clear PGA_WRITEABLE on the old page once it has
 			 * no remaining 4KB or 2MB mappings.
 			 */
 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		} else {
 			/*
 			 * Since this mapping is unmanaged, assume that PG_A
 			 * is set.
 			 */
 			pmap_invalidate_page(pmap, va);
 		}
 		/* From here on, treat the slot as previously empty. */
 		origpte = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((newpte & PG_W) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_adj(pmap, 1);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((newpte & PG_MANAGED) != 0) {
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((newpte & PG_RW) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Update the PTE.
 	 */
 	if ((origpte & PG_V) != 0) {
 		/*
 		 * Reached only through the "validate" label: origpte was
 		 * zeroed above whenever the physical page changed.
 		 */
 validate:
 		origpte = pte_load_store(pte, newpte);
 		KASSERT((origpte & PG_FRAME) == pa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 		    (PG_M | PG_RW)) {
 			if ((origpte & PG_MANAGED) != 0)
 				vm_page_dirty(m);
 
 			/*
 			 * Although the PTE may still have PG_RW set, TLB
 			 * invalidation may nonetheless be required because
 			 * the PTE no longer has PG_M set.
 			 */
 		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 			/*
 			 * This PTE change does not require TLB invalidation.
 			 */
 			goto unchanged;
 		}
 		if ((origpte & PG_A) != 0)
 			pmap_invalidate_page(pmap, va);
 	} else
 		pte_store(pte, newpte);
 
 unchanged:
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * If both the page table page and the reservation are fully
 	 * populated, then attempt promotion.
 	 */
 	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_pde(pmap, pde, va, &lock);
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	/* Common exit: drop the PV list lock, if held, then the pmap lock. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB mapping of "m" at "va" that is never writeable:
  * the PDE built here carries no write permission, and execute permission
  * only if "prot" includes VM_PROT_EXECUTE.  The pmap lock must be held.
  *
  * Returns true on success.  The request is made with PMAP_ENTER_NOSLEEP,
  * PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM, so false is returned when
  * (1) a page table page cannot be allocated without sleeping, (2) a mapping
  * already exists at the specified virtual address, or (3) a PV entry cannot
  * be allocated without reclaiming another PV entry.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pd_entry_t newpde;
 	pt_entry_t PG_V;
 	int rv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PG_V = pmap_valid_bit(pmap);
 
 	/* Assemble the superpage PDE one attribute at a time. */
 	newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V;
 	newpde |= pmap_cache_bits(pmap, m->md.pat_mode, 1);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newpde |= PG_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpde |= PG_U;
 
 	rv = pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp);
 	return (rv == KERN_SUCCESS);
 }
 
 /*
  * Scans the page table page at physical address "pa", which must be page
  * aligned, through the direct map.  Returns true if all NPTEPG entries are
  * zero and false as soon as a nonzero entry is found.
  */
 static bool
 pmap_every_pte_zero(vm_paddr_t pa)
 {
 	pt_entry_t *table;
 	unsigned int idx;
 
 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
 	table = (pt_entry_t *)PHYS_TO_DMAP(pa);
 	for (idx = 0; idx < NPTEPG; idx++) {
 		if (table[idx] != 0)
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and KERN_FAILURE, KERN_RESOURCE_SHORTAGE or
  * KERN_PROTECTION_FAILURE otherwise.  Returns KERN_FAILURE if
  * pmap_allow_2m_x_page() disallows the mapping, or if PMAP_ENTER_NOREPLACE
  * was specified and a mapping already exists at the specified virtual
  * address.  Returns KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was
  * specified and a page table page allocation failed.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and a PV
  * entry allocation failed.  Returns KERN_PROTECTION_FAILURE if
  * pmap_pkru_same() reports that the protection key is not uniform over the
  * 2MB range.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * The pmap lock must be held.  "lockp" is handed to the helpers called
  * here; it is not passed to pmap_alloc_pde() when PMAP_ENTER_NOSLEEP is
  * set.
  */
 static int
 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t PG_G, PG_RW, PG_V;
 	vm_page_t mt, pdpg;
 
 	KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0,
 	    ("pmap_enter_pde: cannot create wired user mapping"));
 	PG_G = pmap_global_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
 	    newpde))) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (KERN_FAILURE);
 	}
 	if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags &
 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	/*
 	 * If pkru is not same for the whole pde range, return failure
 	 * and let vm_fault() cope.  Check after pde allocation, since
 	 * it could sleep.
 	 */
 	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
 		pmap_abort_ptp(pmap, va, pdpg);
 		return (KERN_PROTECTION_FAILURE);
 	}
 	/* Replace any protection key bits in newpde with the range's key. */
 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
 		newpde &= ~X86_PG_PKU_MASK;
 		newpde |= pmap_pkru_get(pmap, va);
 	}
 
 	/*
 	 * If there are existing mappings, either abort or remove them.
 	 */
 	oldpde = *pde;
 	if ((oldpde & PG_V) != 0) {
 		KASSERT(pdpg == NULL || pdpg->ref_count > 1,
 		    ("pmap_enter_pde: pdpg's reference count is too low"));
 		/*
 		 * Under PMAP_ENTER_NOREPLACE, the only valid PDE that may be
 		 * overwritten is a kernel PDE that references a page table
 		 * page containing nothing but zero entries.
 		 */
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va <
 		    VM_MAXUSER_ADDRESS || (oldpde & PG_PS) != 0 ||
 		    !pmap_every_pte_zero(oldpde & PG_FRAME))) {
 			if (pdpg != NULL)
 				pdpg->ref_count--;
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_FAILURE);
 		}
 		/* Break the existing mapping(s). */
 		SLIST_INIT(&free);
 		if ((oldpde & PG_PS) != 0) {
 			/*
 			 * The reference to the PD page that was acquired by
 			 * pmap_alloc_pde() ensures that it won't be freed.
 			 * However, if the PDE resulted from a promotion, then
 			 * a reserved PT page could be freed.
 			 */
 			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
 			pmap_delayed_invl_start();
 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
 			    lockp))
 		               pmap_invalidate_all(pmap);
 			pmap_delayed_invl_finish();
 		}
 		if (va < VM_MAXUSER_ADDRESS) {
 			vm_page_free_pages_toq(&free, true);
 			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
 			    pde));
 		} else {
 			KASSERT(SLIST_EMPTY(&free),
 			    ("pmap_enter_pde: freed kernel page table page"));
 
 			/*
 			 * Both pmap_remove_pde() and pmap_remove_ptes() will
 			 * leave the kernel page table page zero filled.
 			 */
 			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_pde: trie insert failed");
 		}
 	}
 
 	if ((newpde & PG_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 			if (pdpg != NULL)
 				pmap_abort_ptp(pmap, va, pdpg);
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/* Flag every 4KB page of the superpage as writeable. */
 		if ((newpde & PG_RW) != 0) {
 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 		}
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((newpde & PG_W) != 0)
 		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
 	pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
 
 	/*
 	 * Map the superpage.  (This is not a promoted mapping; there will not
 	 * be any lingering 4KB page mappings in the TLB.)
 	 */
 	pde_store(pde, newpde);
 
 	counter_u64_add(pmap_pde_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p",
 	    va, pmap);
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that belong to one object, starting with
  * "m_start", which is mapped at virtual address "start".  Every later page
  * is mapped at the virtual address whose offset from "start" equals the
  * page's offset from "m_start" within the object.  The run ends with the
  * last page whose virtual address is still below "end".  Virtual pages in
  * [start, end) without a corresponding resident page are left unmapped.
  *
  * A 2MB mapping is attempted when the virtual address is 2MB aligned, the
  * whole 2MB range fits below "end", the page has psind 1, and superpages
  * are enabled for the pmap; otherwise a single 4KB mapping is attempted.
  * The object must be locked by the caller.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *lock;
 	vm_offset_t va;
 	vm_page_t m, mpte;
 	vm_pindex_t diff, psize;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	psize = atop(end - start);
 	mpte = NULL;
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		diff = m->pindex - m_start->pindex;
 		if (diff >= psize)
 			break;
 		va = start + ptoa(diff);
 
 		/* Fall back to a 4KB mapping unless a 2MB one succeeds. */
 		if ((va & PDRMASK) != 0 || va + NBPDR > end ||
 		    m->psind != 1 || !pmap_ps_enabled(pmap) ||
 		    !pmap_enter_2mpage(pmap, va, m, prot, &lock)) {
 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 			    mpte, &lock);
 			continue;
 		}
 
 		/*
 		 * The superpage was mapped: step to its last 4KB page so
 		 * that the loop continues with the page that follows it.
 		 */
 		m = &m[NBPDR / PAGE_SIZE - 1];
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Locked wrapper around pmap_enter_quick_locked(): takes the pmap lock,
  * attempts to map page "m" at "va" with protection "prot", and releases
  * any PV list lock that the attempt acquired.  The result of the attempt
  * is discarded.
  *
  * This code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Attempts to create a 4KB mapping of "m" at "va" without sleeping and
  * without replacing an existing mapping.  The new PTE never grants write
  * access; execute access is granted only if "prot" includes
  * VM_PROT_EXECUTE.  The pmap lock must be held.
  *
  * "mpte" is an optional hint: the page table page used by a previous call.
  * It is reused, with an extra reference, when it covers "va".
  *
  * Returns the page table page holding the new PTE for a user address.
  * Returns NULL for a kernel address, and also when no mapping was created:
  * the address is covered by a 2MB mapping, a page table page could not be
  * allocated, the PTE is already nonzero, or no PV entry could be obtained.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	pd_entry_t *pde;
 	pt_entry_t newpte, *pte, PG_V;
 	vm_pindex_t ptepindex;
 
 	KASSERT(!VA_IS_CLEANMAP(va) ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if (va >= VM_MAXUSER_ADDRESS) {
 		/* Kernel address: no page table page is tracked. */
 		mpte = NULL;
 		pte = vtopte(va);
 	} else {
 		/*
 		 * User address: find, or create, the page table page and
 		 * take a reference on it.
 		 */
 		ptepindex = pmap_pde_pindex(va);
 		if (mpte != NULL && mpte->pindex == ptepindex) {
 			/* The caller's hint covers this address. */
 			mpte->ref_count++;
 		} else {
 			pde = pmap_pde(pmap, va);
 			if (pde == NULL || (*pde & PG_V) == 0) {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 * If the allocation fails, give up rather
 				 * than retry.
 				 */
 				mpte = pmap_allocpte_alloc(pmap, ptepindex,
 				    NULL, va);
 				if (mpte == NULL)
 					return (NULL);
 			} else {
 				/* Never disturb an existing 2MB mapping. */
 				if ((*pde & PG_PS) != 0)
 					return (NULL);
 				mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 				mpte->ref_count++;
 			}
 		}
 		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		pte += pmap_pte_index(va);
 	}
 
 	/* An entry is already present: drop our reference and bail out. */
 	if (*pte != 0) {
 		if (mpte != NULL)
 			mpte->ref_count--;
 		return (NULL);
 	}
 
 	/*
 	 * A managed page needs a PV entry; undo the page table page
 	 * reference if one cannot be obtained.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL)
 			pmap_abort_ptp(pmap, va, mpte);
 		return (NULL);
 	}
 
 	/* Account for the new mapping, then build and store its PTE. */
 	pmap_resident_count_adj(pmap, 1);
 
 	newpte = VM_PAGE_TO_PHYS(m) | PG_V;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newpte |= PG_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U | pmap_pkru_get(pmap, va);
 	pte_store(pte, newpte);
 	return (mpte);
 }
 
 /*
  * Make a temporary mapping for a physical address.  This is only intended
  * to be used for panic dumps.
  *
  * Physical address "pa" is mapped into page slot "i" of crashdumpmap and
  * the TLB entry for that slot is invalidated on the current CPU.  Note that
  * the return value is always the base of crashdumpmap, not the address of
  * slot "i".
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t slot;
 
 	slot = (vm_offset_t)crashdumpmap;
 	slot += i * PAGE_SIZE;
 	pmap_kenter(slot, pa);
 	invlpg(slot);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * Pre-maps a device or scatter/gather object into the address space using
  * 2MB pages.  Nothing is done unless both "addr" and "size" are multiples
  * of 2MB, superpages are enabled for the pmap, the object can be populated
  * for the range, and the backing pages are 2MB aligned, physically
  * contiguous, and share one memory attribute.
  *
  * The mappings created here are only an optimization, so every failure is
  * silent.  The object must be write locked by the caller.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 	pd_entry_t *pde;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pa, ptepa;
 	vm_page_t p, pdpg;
 	int pat_mode;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 
 	/* Only whole, aligned 2MB chunks are handled. */
 	if ((addr & (NBPDR - 1)) != 0 || (size & (NBPDR - 1)) != 0)
 		return;
 	if (!pmap_ps_enabled(pmap))
 		return;
 	if (!vm_object_populate(object, pindex, pindex + atop(size)))
 		return;
 
 	p = vm_page_lookup(object, pindex);
 	KASSERT(p->valid == VM_PAGE_BITS_ALL,
 	    ("pmap_object_init_pt: invalid page %p", p));
 	pat_mode = p->md.pat_mode;
 
 	/*
 	 * Give up if the first page is not physically aligned to a 2MB
 	 * page boundary.
 	 */
 	ptepa = VM_PAGE_TO_PHYS(p);
 	if ((ptepa & (NBPDR - 1)) != 0)
 		return;
 
 	/*
 	 * Starting with the second page, give up if any page breaks
 	 * physical contiguity or has a different memory attribute.
 	 */
 	p = TAILQ_NEXT(p, listq);
 	for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) {
 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_object_init_pt: invalid page %p", p));
 		if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode)
 			return;
 		p = TAILQ_NEXT(p, listq);
 	}
 
 	/*
 	 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 	 * "size" is a multiple of 2M, adding the PAT setting to "pa"
 	 * will not affect the termination of this loop.  "addr"
 	 * advances in step with "pa".
 	 */
 	PMAP_LOCK(pmap);
 	for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 	    pa < ptepa + size; pa += NBPDR, addr += NBPDR) {
 		pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL);
 
 		/*
 		 * If a page directory page cannot be allocated without
 		 * blocking, move on to the next mapping rather than
 		 * blocking.
 		 */
 		if (pde == NULL)
 			continue;
 
 		if ((*pde & PG_V) != 0) {
 			/*
 			 * The PDE is already valid: leave it alone and
 			 * return the reference taken by pmap_alloc_pde().
 			 */
 			pdpg->ref_count--;
 			KASSERT(pdpg->ref_count > 0,
 			    ("pmap_object_init_pt: missing reference "
 			    "to page directory page, va: 0x%lx", addr));
 			continue;
 		}
 
 		pde_store(pde, pa | PG_PS | PG_M | PG_A |
 		    PG_U | PG_RW | PG_V);
 		pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
 		counter_u64_add(pmap_pde_mappings, 1);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware
  *	feature, so there is no need to invalidate any TLB entries.
  *	Since pmap_demote_pde() for the wired entry must never fail,
  *	pmap_delayed_invl_start()/finish() calls around the
  *	function are not needed.
  *
  *	The range [sva, eva) is walked one paging level at a time: at each
  *	level "va_next" is set to the start of the next entry at that level
  *	so that an invalid entry can be skipped in one step.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V, PG_G;
 
 	PG_V = pmap_valid_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		pml4e = pmap_pml4e(pmap, sva);
 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			/* The addition wrapped; stop at the end of the range. */
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + NBPDP) & ~PDPMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0)
 			continue;
 		if ((*pdpe & PG_PS) != 0) {
 			/*
 			 * A 1GB mapping is never demoted here; it is unwired
 			 * as a unit, so the range must cover all of it.
 			 */
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G mapping "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, sva, eva, va_next));
 			MPASS(pmap != kernel_pmap); /* XXXKIB */
 			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
 			atomic_clear_long(pdpe, PG_W);
 			pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
 			continue;
 		}
 
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		if ((*pde & PG_V) == 0)
 			continue;
 		if ((*pde & PG_PS) != 0) {
 			if ((*pde & PG_W) == 0)
 				panic("pmap_unwire: pde %#jx is missing PG_W",
 				    (uintmax_t)*pde);
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				atomic_clear_long(pde, PG_W);
 				pmap->pm_stats.wired_count -= NBPDR /
 				    PAGE_SIZE;
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, sva))
 				panic("pmap_unwire: demotion failed");
 		}
 		/* Clip the 4KB walk to the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & PG_V) == 0)
 				continue;
 			if ((*pte & PG_W) == 0)
 				panic("pmap_unwire: pte %#jx is missing PG_W",
 				    (uintmax_t)*pte);
 
 			/*
 			 * PG_W must be cleared atomically.  Although the pmap
 			 * lock synchronizes access to PG_W, another processor
 			 * could be setting PG_M and/or PG_A concurrently.
 			 */
 			atomic_clear_long(pte, PG_W);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *
  *	Nothing is copied unless dst_addr equals src_addr, both pmaps have
  *	the same pm_type, and the destination pmap does not emulate A/D
  *	bits.  The copy stops early, without reporting an error, as soon as
  *	a page table page or PV entry cannot be obtained.  Copied mappings
  *	never carry the wired bit.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 	struct rwlock *lock;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde, srcptepaddr;
 	pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
 	vm_offset_t addr, end_addr, va_next;
 	vm_page_t dst_pdpg, dstmpte, srcmpte;
 
 	if (dst_addr != src_addr)
 		return;
 
 	if (dst_pmap->pm_type != src_pmap->pm_type)
 		return;
 
 	/*
 	 * EPT page table entries that require emulation of A/D bits are
 	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 	 * implementations flag an EPT misconfiguration for exec-only
 	 * mappings we skip this function entirely for emulated pmaps.
 	 */
 	if (pmap_emulate_ad_bits(dst_pmap))
 		return;
 
 	end_addr = src_addr + len;
 	lock = NULL;
 	/*
 	 * Acquire both pmap locks, lower pmap address first.
 	 * NOTE(review): presumably a fixed order to avoid lock-order
 	 * reversals between concurrent callers -- confirm.
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 
 	PG_A = pmap_accessed_bit(dst_pmap);
 	PG_M = pmap_modified_bit(dst_pmap);
 	PG_V = pmap_valid_bit(dst_pmap);
 
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		KASSERT(addr < UPT_MIN_ADDRESS,
 		    ("pmap_copy: invalid to pmap_copy page tables"));
 
 		pml4e = pmap_pml4e(src_pmap, addr);
 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
 			va_next = (addr + NBPML4) & ~PML4MASK;
 			/* The addition wrapped; stop at the end of the range. */
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 
 		va_next = (addr + NBPDP) & ~PDPMASK;
 		if (va_next < addr)
 			va_next = end_addr;
 		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 		if ((*pdpe & PG_V) == 0)
 			continue;
 		if ((*pdpe & PG_PS) != 0) {
 			/*
 			 * 1GB mapping in the source: duplicate the PDP entry
 			 * itself into the destination, minus the wired bit.
 			 */
 			KASSERT(va_next <= end_addr,
 			    ("partial update of non-transparent 1G mapping "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, addr, end_addr, va_next));
 			MPASS((addr & PDPMASK) == 0);
 			MPASS((*pdpe & PG_MANAGED) == 0);
 			srcptepaddr = *pdpe;
 			/* From here on "pdpe" refers to the destination pmap. */
 			pdpe = pmap_pdpe(dst_pmap, addr);
 			if (pdpe == NULL) {
 				if (pmap_allocpte_alloc(dst_pmap,
 				    pmap_pml4e_pindex(addr), NULL, addr) ==
 				    NULL)
 					break;
 				pdpe = pmap_pdpe(dst_pmap, addr);
 			} else {
 				/*
 				 * The PDP page already exists; account for
 				 * the new entry with an extra reference.
 				 */
 				pml4e = pmap_pml4e(dst_pmap, addr);
 				dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
 				dst_pdpg->ref_count++;
 			}
 			KASSERT(*pdpe == 0,
 			    ("1G mapping present in dst pmap "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, addr, end_addr, va_next));
 			*pdpe = srcptepaddr & ~PG_W;
 			pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE);
 			continue;
 		}
 
 		va_next = (addr + NBPDR) & ~PDRMASK;
 		if (va_next < addr)
 			va_next = end_addr;
 
 		pde = pmap_pdpe_to_pde(pdpe, addr);
 		srcptepaddr = *pde;
 		if (srcptepaddr == 0)
 			continue;
 
 		if (srcptepaddr & PG_PS) {
 			/*
 			 * We can only virtual copy whole superpages.
 			 */
 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 				continue;
 			pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL);
 			if (pde == NULL)
 				break;
 			/*
 			 * Copy only into an empty destination PDE, and for a
 			 * managed mapping only if a PV entry can be inserted
 			 * without reclaiming.
 			 */
 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM, &lock))) {
 				/*
 				 * We leave the dirty bit unchanged because
 				 * managed read/write superpage mappings are
 				 * required to be dirty.  However, managed
 				 * superpage mappings are not required to
 				 * have their accessed bit set, so we clear
 				 * it because we don't know if this mapping
 				 * will be used.
 				 */
 				srcptepaddr &= ~PG_W;
 				if ((srcptepaddr & PG_MANAGED) != 0)
 					srcptepaddr &= ~PG_A;
 				*pde = srcptepaddr;
 				pmap_resident_count_adj(dst_pmap, NBPDR /
 				    PAGE_SIZE);
 				counter_u64_add(pmap_pde_mappings, 1);
 			} else
 				pmap_abort_ptp(dst_pmap, addr, dst_pdpg);
 			continue;
 		}
 
 		/* 4KB mappings: locate the source page table page. */
 		srcptepaddr &= PG_FRAME;
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		KASSERT(srcmpte->ref_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 
 		if (va_next > end_addr)
 			va_next = end_addr;
 
 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 		src_pte = &src_pte[pmap_pte_index(addr)];
 		/* The destination page table page is allocated lazily below. */
 		dstmpte = NULL;
 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
 			ptetemp = *src_pte;
 
 			/*
 			 * We only virtual copy managed pages.
 			 */
 			if ((ptetemp & PG_MANAGED) == 0)
 				continue;
 
 			/*
 			 * Take one reference on the destination page table
 			 * page per candidate entry: either bump the page
 			 * already in hand or allocate it on first use.
 			 */
 			if (dstmpte != NULL) {
 				KASSERT(dstmpte->pindex ==
 				    pmap_pde_pindex(addr),
 				    ("dstmpte pindex/addr mismatch"));
 				dstmpte->ref_count++;
 			} else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
 			    NULL)) == NULL)
 				goto out;
 			dst_pte = (pt_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 			dst_pte = &dst_pte[pmap_pte_index(addr)];
 			if (*dst_pte == 0 &&
 			    pmap_try_insert_pv_entry(dst_pmap, addr,
 			    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
 				/*
 				 * Clear the wired, modified, and accessed
 				 * (referenced) bits during the copy.
 				 */
 				*dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
 				pmap_resident_count_adj(dst_pmap, 1);
 			} else {
 				/*
 				 * The slot is occupied or no PV entry could
 				 * be had: undo the reference and give up.
 				 */
 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
 				goto out;
 			}
 			/* Have we copied all of the valid mappings? */
 			if (dstmpte->ref_count >= srcmpte->ref_count)
 				break;
 		}
 	}
 out:
 	/* "lock" is a PV list lock that the PV insertion helpers may have taken. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Duplicate the source pmap's PKRU state into the destination pmap via
  * pmap_pkru_copy().  This is a no-op returning 0 unless both pmaps are of
  * the same type, that type is PT_X86, and the CPU advertises PKU.
  *
  * If the copy fails with ENOMEM, the partial copy is discarded, the locks
  * are dropped, and the operation is retried after vm_wait().  Any other
  * result, including success, is returned to the caller.
  */
 int
 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
 {
 	int error;
 
 	if (dst_pmap->pm_type != src_pmap->pm_type)
 		return (0);
 	if (dst_pmap->pm_type != PT_X86)
 		return (0);
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return (0);
 
 	do {
 		/* Lock the pmap with the lower address first. */
 		if (dst_pmap < src_pmap) {
 			PMAP_LOCK(dst_pmap);
 			PMAP_LOCK(src_pmap);
 		} else {
 			PMAP_LOCK(src_pmap);
 			PMAP_LOCK(dst_pmap);
 		}
 		error = pmap_pkru_copy(dst_pmap, src_pmap);
 		if (error == ENOMEM) {
 			/* Clean up partial copy on failure due to no memory. */
 			pmap_pkru_deassign_all(dst_pmap);
 		}
 		PMAP_UNLOCK(src_pmap);
 		PMAP_UNLOCK(dst_pmap);
 		if (error == ENOMEM)
 			vm_wait(NULL);
 	} while (error == ENOMEM);
 	return (error);
 }
 
 /*
  * Fill the hardware page backing "m" with zeroes, accessing it through
  * its direct map address.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	void *dmap_va;
 
 	dmap_va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagezero(dmap_va);
 }
 
 /*
  * Zero an area within a single hardware page.  "off" and "size" must not
  * describe an area extending beyond that page.  A request covering the
  * whole page is handed to pagezero(); anything smaller uses bzero().
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *base;
 
 	base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero(base + off, size);
 		return;
 	}
 	pagezero(base);
 }
 
 /*
  * Copy the contents of one hardware page ("msrc") to another ("mdst"),
  * addressing both through the direct map.
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * NOTE(review): presumably advertises to machine-independent code that
  * unmapped I/O buffers may be used on this platform; the consumers are
  * not visible in this part of the file -- confirm before relying on it.
  */
 int unmapped_buf_allowed = 1;
 
 /*
  * Copy "xfersize" bytes from the page array "ma", starting at byte offset
  * "a_offset", to the page array "mb", starting at byte offset "b_offset".
  * Each pass copies the longest run that stays inside one source page and
  * one destination page.  Both pages are made addressable with
  * pmap_map_io_transient(); when that call reports that it created
  * mappings, they are torn down again once the run has been copied.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_page_t pair[2];
 	vm_offset_t kva[2], src_off, dst_off;
 	boolean_t need_unmap;
 	int chunk;
 
 	for (; xfersize > 0; xfersize -= chunk) {
 		/* Split each offset into a page index and an in-page offset. */
 		src_off = a_offset & PAGE_MASK;
 		pair[0] = ma[a_offset >> PAGE_SHIFT];
 		dst_off = b_offset & PAGE_MASK;
 		pair[1] = mb[b_offset >> PAGE_SHIFT];
 
 		/* Do not run past the end of either page. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		need_unmap = pmap_map_io_transient(pair, kva, 2, FALSE);
 		bcopy((char *)kva[0] + src_off, (char *)kva[1] + dst_off,
 		    chunk);
 		if (__predict_false(need_unmap))
 			pmap_unmap_io_transient(pair, kva, 2, FALSE);
 
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 /*
  * Report whether the given pmap owns one of the first 16 PV entries
  * reachable from this page.  The page's own PV list is scanned first and
  * then, for non-fictitious pages, the PV list of the containing
  * superpage; the limit of 16 applies to both lists combined.  The limit
  * may be raised or lowered in the future; it is only necessary that true
  * be returned for a small subset of pmaps for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv;
 	boolean_t found;
 	int examined;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	examined = 0;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 
 	/* 4KB mappings of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++examined >= 16)
 			goto done;
 	}
 
 	/* Fictitious pages have no superpage PV list to consult. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto done;
 
 	/* 2MB mappings that include the page. */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			break;
 		}
 		if (++examined >= 16)
 			break;
 	}
 done:
 	rw_runlock(lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.  Unmanaged pages always yield 0.
  *
  *	Both the page's own PV list (4KB mappings) and, for non-fictitious
  *	pages, the containing superpage's PV list (2MB mappings) are
  *	examined.  The count is restarted from zero whenever a PV list is
  *	found to have changed while the PV list lock was dropped.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct rwlock *lock;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 	int count, md_gen, pvh_gen;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * The PV list lock is already held, so first try the pmap
 		 * lock without blocking.  On failure, drop the PV list lock,
 		 * block on the pmap lock, and retake the PV list lock; a
 		 * changed generation count means the list was modified in
 		 * the meantime and the scan must start over.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		if ((*pte & PG_W) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			/* Same protocol as above, watching both lists. */
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* For a 2MB mapping the wired bit lives in the PDE. */
 			pte = pmap_pde(pmap, pv->pv_va);
 			if ((*pte & PG_W) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	return (count);
 }
 
 /*
  * Returns TRUE if the given page has at least one managed mapping, either
  * as an individual 4KB page or as part of a 2mpage.  Otherwise, returns
  * FALSE.  Unmanaged pages always yield FALSE, and the 2mpage PV list is
  * not consulted for fictitious pages.
  */
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
 	struct rwlock *lock;
 	boolean_t mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 	if (!TAILQ_EMPTY(&m->md.pv_list))
 		mapped = TRUE;
 	else if ((m->flags & PG_FICTITIOUS) != 0)
 		mapped = FALSE;
 	else
 		mapped = !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
 	rw_runlock(lock);
 	return (mapped);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  *
  * Although this function destroys all of the pmap's managed,
  * non-wired mappings, it can delay and batch the invalidation of TLB
  * entries without calling pmap_delayed_invl_start() and
  * pmap_delayed_invl_finish().  Because the pmap is not active on
  * any other processor, none of these TLB entries will ever be used
  * before their eventual invalidation.  Consequently, there is no need
  * for either pmap_remove_all() or pmap_remove_write() to wait for
  * that eventual TLB invalidation.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pd_entry_t ptepde;
 	pt_entry_t *pte, tpte;
 	pt_entry_t PG_M, PG_RW, PG_V;
 	struct spglist free;
 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, i, idx;
 	boolean_t superpage;
 	vm_paddr_t pa;
 
 	/*
 	 * Assert that the given pmap is only active on the current
 	 * CPU.  Unfortunately, we cannot block another CPU from
 	 * activating the pmap while this function is executing.
 	 */
 	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
 #ifdef INVARIANTS
 	{
 		cpuset_t other_cpus;
 
 		other_cpus = all_cpus;
 		critical_enter();
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		critical_exit();
 		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
 	}
 #endif
 
 	lock = NULL;
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	/* Fully emptied PV chunks are collected per domain and freed in a batch. */
 	for (i = 0; i < PMAP_MEMDOM; i++)
 		TAILQ_INIT(&free_chunks[i]);
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/*
 			 * A set bit in pc_map marks a free PV entry, so the
 			 * complement, limited to the bits that exist in this
 			 * field, is the set of allocated entries.
 			 */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfq(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				/*
 				 * Walk down to the PDE.  "ptepde" records the
 				 * entry one level above the mapping itself:
 				 * the PDPE for a superpage, or (below) the
 				 * PDE for a 4KB page.
 				 */
 				pte = pmap_pdpe(pmap, pv->pv_va);
 				ptepde = *pte;
 				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 				tpte = *pte;
 				if ((tpte & (PG_PS | PG_V)) == PG_V) {
 					superpage = FALSE;
 					ptepde = tpte;
 					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 					    PG_FRAME);
 					pte = &pte[pmap_pte_index(pv->pv_va)];
 					tpte = *pte;
 				} else {
 					/*
 					 * Keep track whether 'tpte' is a
 					 * superpage explicitly instead of
 					 * relying on PG_PS being set.
 					 *
 					 * This is because PG_PS is numerically
 					 * identical to PG_PTE_PAT and thus a
 					 * regular page could be mistaken for
 					 * a superpage.
 					 */
 					superpage = TRUE;
 				}
 
 				if ((tpte & PG_V) == 0) {
 					panic("bad pte va %lx pte %lx",
 					    pv->pv_va, tpte);
 				}
 
 				/*
 				 * We cannot remove wired pages from a
 				 * process' mapping at this time.  The chunk
 				 * keeps this entry and so is not all free.
 				 */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 
 				/*
 				 * Because this pmap is not active on other
 				 * processors, the dirty bit cannot have
 				 * changed state since we last loaded pte.
 				 */
 				pte_clear(pte);
 
 				if (superpage)
 					pa = tpte & PG_PS_FRAME;
 				else
 					pa = tpte & PG_FRAME;
 
 				m = PHYS_TO_VM_PAGE(pa);
 				KASSERT(m->phys_addr == pa,
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad tpte %#jx",
 				    (uintmax_t)tpte));
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 * A dirty superpage dirties every 4KB page
 				 * that it spans.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 					if (superpage) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 					} else
 						vm_page_dirty(m);
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				if (superpage) {
 					pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 					pvh->pv_gen++;
 					/*
 					 * With the last 2MB mapping gone, a
 					 * constituent page with no 4KB
 					 * mappings either is no longer
 					 * writeable through any mapping.
 					 */
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
 							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * Release the page table page that was
 					 * retained for this superpage mapping,
 					 * if there is one.
 					 */
 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 					if (mpte != NULL) {
 						KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 						    ("pmap_remove_pages: pte page not promoted"));
 						pmap_resident_count_adj(pmap, -1);
 						KASSERT(mpte->ref_count == NPTEPG,
 						    ("pmap_remove_pages: pte page reference count error"));
 						mpte->ref_count = 0;
 						pmap_add_delayed_free_list(mpte, &free, FALSE);
 					}
 				} else {
 					pmap_resident_count_adj(pmap, -1);
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 					m->md.pv_gen++;
 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
 					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m, PGA_WRITEABLE);
 					}
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 				freed++;
 			}
 		}
 		PV_STAT(counter_u64_add(pv_entry_frees, freed));
 		PV_STAT(counter_u64_add(pv_entry_spare, freed));
 		PV_STAT(counter_u64_add(pv_entry_count, -freed));
 		/* A chunk holding no wired entries is now empty; queue it for freeing. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	/* One batched TLB invalidation covers every mapping removed above. */
 	pmap_invalidate_all(pmap);
 	pmap_pkru_deassign_all(pmap);
 	free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
 	PMAP_UNLOCK(pmap);
 	/* Page table pages are freed only after the pmap lock is dropped. */
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * Test whether any managed mapping of page "m" has the requested bits
  * set.  When "modified" is TRUE the mapping's RW and M bits must both be
  * set; when "accessed" is TRUE its V and A bits must both be set; when
  * both arguments are TRUE all four bits are required in the same mapping.
  * With neither argument set the mask is zero and any mapping matches.
  *
  * The page's 4KB mappings are checked first, followed (for non-fictitious
  * pages) by the 2MB mappings that contain it.  The search stops at the
  * first match.  The bit encodings are looked up per pmap because they
  * depend on the pmap being examined.
  */
 static boolean_t
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct rwlock *lock;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	pt_entry_t *pte, mask;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	pmap_t pmap;
 	int md_gen, pvh_gen;
 	boolean_t rv;
 
 	rv = FALSE;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * Try the pmap lock without blocking while the PV list lock
 		 * is held.  Otherwise drop the PV list lock, block on the
 		 * pmap lock, retake the PV list lock, and start over if the
 		 * list's generation count shows that it changed.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		mask = 0;
 		if (modified) {
 			PG_M = pmap_modified_bit(pmap);
 			PG_RW = pmap_rw_bit(pmap);
 			mask |= PG_RW | PG_M;
 		}
 		if (accessed) {
 			PG_A = pmap_accessed_bit(pmap);
 			PG_V = pmap_valid_bit(pmap);
 			mask |= PG_V | PG_A;
 		}
 		rv = (*pte & mask) == mask;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			/* Same protocol as above, watching both lists. */
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* For a 2MB mapping the bits are tested in the PDE. */
 			pte = pmap_pde(pmap, pv->pv_va);
 			mask = 0;
 			if (modified) {
 				PG_M = pmap_modified_bit(pmap);
 				PG_RW = pmap_rw_bit(pmap);
 				mask |= PG_RW | PG_M;
 			}
 			if (accessed) {
 				PG_A = pmap_accessed_bit(pmap);
 				PG_V = pmap_valid_bit(pmap);
 				mask |= PG_V | PG_A;
 			}
 			rv = (*pte & mask) == mask;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether the specified managed physical page has been
  *	modified through any of its mappings.  A page that is not write
  *	mapped is reported as unmodified without examining its mappings.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	boolean_t dirty;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * If the page is not busied then this check is racy.
 	 */
 	dirty = FALSE;
 	if (pmap_page_is_write_mapped(m))
 		dirty = pmap_page_test_mappings(m, FALSE, TRUE);
 	return (dirty);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Report whether the specified virtual address is eligible for
  *	prefault: TRUE only when a valid, non-superpage page directory
  *	entry covers the address and the page table entry beneath it is
  *	not valid.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	boolean_t prefaultable;
 
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 	pde = pmap_pde(pmap, addr);
 	if (pde == NULL || (*pde & (PG_PS | PG_V)) != PG_V) {
 		/* No page table page here, or the address is in a superpage. */
 		prefaultable = FALSE;
 	} else {
 		pte = pmap_pde_to_pte(pde, addr);
 		prefaultable = (*pte & PG_V) == 0;
 	}
 	PMAP_UNLOCK(pmap);
 	return (prefaultable);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Report whether the specified managed physical page has been
  *	referenced through any of its mappings.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and busied.  Writable 2MB mappings that contain
  * the page are first demoted, after which every 4KB mapping of the page
  * has its RW and M bits cleared.  If a cleared mapping was modified, the
  * page is marked dirty.  On return PGA_WRITEABLE is clear on the page.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *pde;
 	pt_entry_t oldpte, *pte, PG_M, PG_RW;
 	vm_offset_t va;
 	int pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	vm_page_assert_busied(m);
 	/* Nothing to do if the page has no writable mappings. */
 	if (!pmap_page_is_write_mapped(m))
 		return;
 
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages have no superpage PV list; use an empty stand-in. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	rw_wlock(lock);
 retry:
 	/*
 	 * Pass 1: demote each writable 2MB mapping.  The _SAFE iterator is
 	 * used because the current entry may leave this list during the
 	 * loop body.
 	 */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * Try the pmap lock without blocking; otherwise drop and
 		 * retake the PV list lock around a blocking acquire and
 		 * start over if the list changed in between.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		if ((*pde & PG_RW) != 0)
 			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: write-protect each 4KB mapping of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen ||
 			    md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		oldpte = *pte;
 		if (oldpte & PG_RW) {
 			/*
 			 * Clear RW and M with a compare-and-set loop.  On
 			 * failure atomic_fcmpset_long() refreshes "oldpte",
 			 * so the value tested below is the one that was
 			 * actually replaced.
 			 */
 			while (!atomic_fcmpset_long(pte, &oldpte, oldpte &
 			    ~(PG_RW | PG_M)))
 				cpu_spinwait();
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	pmap_delayed_invl_wait(m);
 }
 
 /*
  * Decide whether the referenced bit of "pte" may be cleared in place.
  * This is always permitted for pmaps that do not emulate A/D bits.  For
  * an emulating pmap (asserted to be PT_EPT) the answer depends on which
  * EPT permission bits would remain set afterwards.
  */
 static __inline boolean_t
 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 {
 	boolean_t exec_only_ok;
 
 	if (!pmap_emulate_ad_bits(pmap))
 		return (TRUE);
 
 	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 
 	/*
 	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
 	 * so the referenced (aka EPT_PG_READ) bit must not be cleared while
 	 * the EPT_PG_WRITE bit is set.
 	 */
 	if ((pte & EPT_PG_WRITE) != 0)
 		return (FALSE);
 
 	/* Without the execute bit, clearing leaves XWR = 000. */
 	if ((pte & EPT_PG_EXECUTE) == 0)
 		return (TRUE);
 
 	/*
 	 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
 	 */
 	exec_only_ok = (pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0;
 	return (exec_only_ok ? TRUE : FALSE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  *
  *	A DI block is not needed within this function, because
  *	invalidations are performed before the PV list lock is
  *	released.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A, PG_M, PG_RW;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	int cleared, md_gen, not_cleared, pvh_gen;
 	struct spglist free;
 	boolean_t demoted;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 	rw_wlock(lock);
 retry:
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, pv->pv_va);
 		oldpde = *pde;
 		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Although "oldpde" is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((oldpde & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
 			 * tested.  Apply a simple "hash" function on the
 			 * physical page number, the virtual superpage number,
 			 * and the pmap address to select one 4KB page out of
 			 * the 512 on which testing the reference bit will
 			 * result in clearing that reference bit.  This
 			 * function is designed to avoid the selection of the
 			 * same 4KB page for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 			    (oldpde & PG_W) == 0) {
 				if (safe_to_clear_referenced(pmap, oldpde)) {
 					atomic_clear_long(pde, PG_A);
 					pmap_invalidate_page(pmap, pv->pv_va);
 					demoted = FALSE;
 				} else if (pmap_demote_pde_locked(pmap, pde,
 				    pv->pv_va, &lock)) {
 					/*
 					 * Remove the mapping to a single page
 					 * so that a subsequent access may
 					 * repromote.  Since the underlying
 					 * page table page is fully populated,
 					 * this removal never frees a page
 					 * table page.
 					 */
 					demoted = TRUE;
 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
 					    PG_PS_FRAME);
 					pte = pmap_pde_to_pte(pde, va);
 					pmap_remove_pte(pmap, pte, va, *pde,
 					    NULL, &lock);
 					pmap_invalidate_page(pmap, va);
 				} else
 					demoted = TRUE;
 
 				if (demoted) {
 					/*
 					 * The superpage mapping was removed
 					 * entirely and therefore 'pv' is no
 					 * longer valid.
 					 */
 					if (pvf == pv)
 						pvf = NULL;
 					pv = NULL;
 				}
 				cleared++;
 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 				    ("inconsistent pv lock %p %p for page %p",
 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if ((*pte & PG_A) != 0) {
 			if (safe_to_clear_referenced(pmap, *pte)) {
 				atomic_clear_long(pte, PG_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else if ((*pte & PG_W) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for
 				 * them is wasted effort. We do the
 				 * hard work for unwired pages only.
 				 */
 				pmap_remove_pte(pmap, pte, pv->pv_va,
 				    *pde, &free, &lock);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 				if (pvf == pv)
 					pvf = NULL;
 				pv = NULL;
 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 				    ("inconsistent pv lock %p %p for page %p",
 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  *
  *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice, and
  *	any pmap for which pmap_emulate_ad_bits() is true, makes this function
  *	a no-op.  For a managed, valid 4KB mapping that is both writable and
  *	modified, PG_M and PG_A are cleared, and under MADV_DONTNEED the page
  *	is marked dirty first.  Otherwise only PG_A is cleared, if it is set.
  *	Managed 2MB mappings are demoted before their 4KB mappings are
  *	processed; unmanaged 2MB mappings and 1GB mappings are skipped.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 	struct rwlock *lock;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
 	vm_offset_t va, va_next;
 	vm_page_t m;
 	bool anychanged;
 
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 
 	/*
 	 * A/D bit emulation requires an alternate code path when clearing
 	 * the modified and accessed bits below. Since this function is
 	 * advisory in nature we skip it entirely for pmaps that require
 	 * A/D bit emulation.
 	 */
 	if (pmap_emulate_ad_bits(pmap))
 		return;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	anychanged = false;
 	pmap_delayed_invl_start();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * At each level, "va_next" is advanced to the next boundary
 		 * of that level.  If the addition wraps around, the walk is
 		 * clamped to "eva" so that the outer loop terminates.
 		 */
 		pml4e = pmap_pml4e(pmap, sva);
 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + NBPDP) & ~PDPMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0)
 			continue;
 		if ((*pdpe & PG_PS) != 0) {
 			/* 1GB mappings are left untouched. */
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G mapping "
 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
 			    *pdpe, sva, eva, va_next));
 			continue;
 		}
 
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		oldpde = *pde;
 		if ((oldpde & PG_V) == 0)
 			continue;
 		else if ((oldpde & PG_PS) != 0) {
 			/* Unmanaged 2MB mappings are left untouched. */
 			if ((oldpde & PG_MANAGED) == 0)
 				continue;
 			lock = NULL;
 			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
 				if (lock != NULL)
 					rw_wunlock(lock);
 
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Choosing the last page
 			 * within the address range [sva, min(va_next, eva))
 			 * generally results in more repromotions.  Since the
 			 * underlying page table page is fully populated, this
 			 * removal never frees a page table page.
 			 */
 			if ((oldpde & PG_W) == 0) {
 				va = eva;
 				if (va > va_next)
 					va = va_next;
 				va -= PAGE_SIZE;
 				KASSERT(va >= sva,
 				    ("pmap_advise: no address gap"));
 				pte = pmap_pde_to_pte(pde, va);
 				KASSERT((*pte & PG_V) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_pte(pmap, pte, va, *pde, NULL,
 				    &lock);
 				anychanged = true;
 			}
 			if (lock != NULL)
 				rw_wunlock(lock);
 		}
 		if (va_next > eva)
 			va_next = eva;
 		/*
 		 * "va" marks the start of a pending run of modified global
 		 * (PG_G) mappings that still needs a ranged invalidation.
 		 * The value "va_next" means that no run is pending.
 		 */
 		va = va_next;
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 				goto maybe_invlrng;
 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				atomic_clear_long(pte, PG_M | PG_A);
 			} else if ((*pte & PG_A) != 0)
 				atomic_clear_long(pte, PG_A);
 			else
 				goto maybe_invlrng;
 
 			/*
 			 * This PTE was changed.  A global mapping starts or
 			 * extends the pending run; a non-global mapping is
 			 * covered by the single pmap_invalidate_all() issued
 			 * after the walk.
 			 */
 			if ((*pte & PG_G) != 0) {
 				if (va == va_next)
 					va = sva;
 			} else
 				anychanged = true;
 			continue;
 maybe_invlrng:
 			/* An unchanged PTE ends any pending run: flush it. */
 			if (va != va_next) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = va_next;
 			}
 		}
 		/* Flush a run that extends to the end of this page table. */
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finish();
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed and busied (both are asserted).  Nothing is
  *	done unless pmap_page_is_write_mapped() reports that the page is write
  *	mapped.  Writable 2MB mappings of the page are demoted; if the mapping
  *	is not wired, the 4KB mapping of "m" is then write protected and the
  *	page is marked dirty.  For 4KB mappings that are both writable and
  *	modified, only PG_M is cleared.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_M, PG_RW;
 	struct rwlock *lock;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	vm_page_assert_busied(m);
 
 	if (!pmap_page_is_write_mapped(m))
 		return;
 	/* Fictitious pages use the dummy header for the 2MB pv list. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_wlock(lock);
 restart:
 	/* First pass: 2MB mappings that include this page. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock could not be taken without blocking.
 			 * Drop the pv list lock, block on the pmap lock, and
 			 * retake the pv list lock.  If the list generation
 			 * changed in the meantime, start over.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		oldpde = *pde;
 		/* If oldpde has PG_RW set, then it also has PG_M set. */
 		if ((oldpde & PG_RW) != 0 &&
 		    pmap_demote_pde_locked(pmap, pde, va, &lock) &&
 		    (oldpde & PG_W) == 0) {
 			/*
 			 * Write protect the mapping to a single page so that
 			 * a subsequent write access may repromote.
 			 */
 			/* Advance "va" to the 4KB page that maps "m". */
 			va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
 			pte = pmap_pde_to_pte(pde, va);
 			atomic_clear_long(pte, PG_M | PG_RW);
 			vm_page_dirty(m);
 			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* Second pass: 4KB mappings of this page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* Same protocol as above, checking both lists. */
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			atomic_clear_long(pte, PG_M);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /*
  * Update the properties of a leaf page table entry: the bits selected by
  * "mask" are replaced with "bits".  The store is performed with a
  * compare-and-set loop, and is skipped entirely when the entry already
  * holds the desired value.
  */
 static __inline void
 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
 {
 	u_long *entry, cur, want;

 	entry = (u_long *)pte;
 	cur = *entry;
 	for (;;) {
 		want = (cur & ~mask) | bits;
 		/* Nothing to change. */
 		if (want == cur)
 			break;
 		/* On failure "cur" is refreshed and the value recomputed. */
 		if (atomic_fcmpset_long(entry, &cur, want))
 			break;
 	}
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  *
  * "pa" and "size" need not be page aligned: the request is widened to whole
  * pages and the returned pointer includes the sub-page offset of "pa".
  * "mode" is the memory type handed to pmap_kenter_attr() or
  * pmap_change_props_locked().  "flags" is a mask of MAPDEV_* values:
  * MAPDEV_SETATTR requires the mapping to have exactly "mode", and
  * MAPDEV_FLUSHCACHE requests a cache invalidation of a newly entered range.
  * Failure to find a preinit slot or to allocate KVA panics.
  */
 static void *
 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t va, offset;
 	vm_size_t tmpsize;
 	int i;
 
 	offset = pa & PAGE_MASK;
 	size = round_page(offset + size);
 	pa = trunc_page(pa);
 
 	if (!pmap_initialized) {
 		/*
 		 * Before the pmap is initialized: record the request in the
 		 * first unused preinit slot (va == 0) and carve its virtual
 		 * addresses out of virtual_avail.
 		 */
 		va = 0;
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (ppim->va == 0) {
 				ppim->pa = pa;
 				ppim->sz = size;
 				ppim->mode = mode;
 				ppim->va = virtual_avail;
 				virtual_avail += size;
 				va = ppim->va;
 				break;
 			}
 		}
 		if (va == 0)
 			panic("%s: too many preinit mappings", __func__);
 	} else {
 		/*
 		 * If we have a preinit mapping, re-use it.
 		 */
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			/* The mode must match only if MAPDEV_SETATTR is set. */
 			if (ppim->pa == pa && ppim->sz == size &&
 			    (ppim->mode == mode ||
 			    (flags & MAPDEV_SETATTR) == 0))
 				return ((void *)(ppim->va + offset));
 		}
 		/*
 		 * If the specified range of physical addresses fits within
 		 * the direct map window, use the direct map.
 		 */
 		if (pa < dmaplimit && pa + size <= dmaplimit) {
 			va = PHYS_TO_DMAP(pa);
 			if ((flags & MAPDEV_SETATTR) != 0) {
 				PMAP_LOCK(kernel_pmap);
 				i = pmap_change_props_locked(va, size,
 				    PROT_NONE, mode, flags);
 				PMAP_UNLOCK(kernel_pmap);
 			} else
 				i = 0;
 			/*
 			 * If the attribute change failed, fall through and
 			 * build a separate mapping in freshly allocated KVA.
 			 */
 			if (!i)
 				return ((void *)(va + offset));
 		}
 		va = kva_alloc(size);
 		if (va == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 	}
 	/* Enter the pages one at a time with the requested memory type. */
 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 	if ((flags & MAPDEV_FLUSHCACHE) != 0)
 		pmap_invalidate_cache_range(va, va + tmpsize);
 	return ((void *)(va + offset));
 }
 
 /*
  * Map device memory with the memory type "mode".  The mapping is required
  * to carry that attribute (MAPDEV_SETATTR) and a cache flush is requested
  * for it (MAPDEV_FLUSHCACHE).
  */
 void *
 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	int flags;

 	flags = MAPDEV_FLUSHCACHE | MAPDEV_SETATTR;
 	return (pmap_mapdev_internal(pa, size, mode, flags));
 }
 
 /*
  * Map device memory using the PAT_UNCACHEABLE memory type.
  */
 void *
 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 	void *mapping;

 	mapping = pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE);
 	return (mapping);
 }
 
 /*
  * Map a region as PAT_UNCACHEABLE with MAPDEV_SETATTR only; unlike
  * pmap_mapdev_attr(), no cache flush is requested.
  */
 void *
 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
 {
 	void *mapping;

 	mapping = pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
 	    MAPDEV_SETATTR);
 	return (mapping);
 }
 
 /*
  * Map a region as PAT_WRITE_BACK, requesting a cache flush
  * (MAPDEV_FLUSHCACHE) but not forcing the attribute onto an existing
  * mapping (MAPDEV_SETATTR is not passed).
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 	void *mapping;

 	mapping = pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
 	    MAPDEV_FLUSHCACHE);
 	return (mapping);
 }
 
 /*
  * Undo a mapping established by the pmap_mapdev family.  Direct map
  * addresses are left alone.  A matching preinit mapping is released only
  * while the pmap is still uninitialized; afterwards it is kept.  Any other
  * mapping is removed and its KVA freed once the pmap is initialized.
  */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *slot, *end;

 	/* If we gave a direct map region in pmap_mapdev, do nothing */
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return;

 	/* Widen the request to whole pages. */
 	size = round_page((va & PAGE_MASK) + size);
 	va = trunc_page(va);

 	end = pmap_preinit_mapping + PMAP_PREINIT_MAPPING_COUNT;
 	for (slot = pmap_preinit_mapping; slot < end; slot++) {
 		if (slot->va != va || slot->sz != size)
 			continue;
 		if (!pmap_initialized) {
 			/* Release the slot for reuse. */
 			slot->pa = 0;
 			slot->va = 0;
 			slot->sz = 0;
 			slot->mode = 0;
 			/* Give back the addresses if they were the last. */
 			if (va + size == virtual_avail)
 				virtual_avail = va;
 		}
 		return;
 	}

 	if (pmap_initialized) {
 		pmap_qremove(va, atop(size));
 		kva_free(va, size);
 	}
 }
 
 /*
  * Tries to demote a 1GB page mapping.
  *
  * The caller must hold the pmap lock (asserted), and "*pdpe" must be a
  * valid 1GB (PG_PS) mapping.  A new page directory page is allocated and
  * filled with NPDEPG entries that together cover the same physical range,
  * after which "*pdpe" is rewritten to point at that page.  Returns TRUE on
  * success and FALSE, leaving the mapping untouched, if the page directory
  * page cannot be allocated.
  */
 static boolean_t
 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 {
 	pdp_entry_t newpdpe, oldpdpe;
 	pd_entry_t *firstpde, newpde, *pde;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pdpgpa;
 	vm_page_t pdpg;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpdpe = *pdpe;
 	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 	pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
 	    VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT);
 	if (pdpg  == NULL) {
 		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
 	}
 	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
 	/* The new page directory page is filled in via the direct map. */
 	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
 	/*
 	 * The replacement PDPE is writable, accessed and modified; of the old
 	 * entry's bits, only PG_U is carried over.
 	 */
 	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpdpe & PG_A) != 0,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 	/*
 	 * Each PDE starts as a copy of the old 1GB entry (asserted above to
 	 * have PG_PS set); only the address advances, by NBPDR per entry.
 	 */
 	newpde = oldpdpe;
 
 	/*
 	 * Initialize the page directory page.
 	 */
 	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 		*pde = newpde;
 		newpde += NBPDR;
 	}
 
 	/*
 	 * Demote the mapping.
 	 */
 	*pdpe = newpdpe;
 
 	/*
 	 * Invalidate a stale recursive mapping of the page directory page.
 	 */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 
 	counter_u64_add(pmap_pdpe_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Records "ma" as the memory attribute of page "m" and, for pages that are
  * not fictitious, applies it to the page's direct map alias.  Panics if
  * the direct map cannot be updated.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	vm_offset_t dmap_va;

 	m->md.pat_mode = ma;

 	/* The direct map is not updated for fictitious pages. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;

 	/*
 	 * "m" is a normal page, so bring its direct mapping in line.  This
 	 * update can be relied upon to perform any cache operations that are
 	 * required for data coherence.
 	 */
 	dmap_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (pmap_change_attr(dmap_va, PAGE_SIZE, m->md.pat_mode) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Variant of pmap_page_set_memattr() that updates the direct map through
  * pmap_change_props_locked() with no MAPDEV_* flags, i.e. without
  * requesting MAPDEV_FLUSHCACHE.  Panics if the direct map cannot be
  * updated.
  */
 void
 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
 {
 	int rv;

 	m->md.pat_mode = ma;

 	/* Only non-fictitious pages have their direct mapping updated. */
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		PMAP_LOCK(kernel_pmap);
 		rv = pmap_change_props_locked(
 		    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, PROT_NONE,
 		    m->md.pat_mode, 0);
 		PMAP_UNLOCK(kernel_pmap);
 		if (rv != 0)
 			panic("memory attribute change on the direct map failed");
 	}
 }
 
 /*
  * Changes the specified virtual address range's memory type to that given by
  * the parameter "mode".  The specified virtual address range must be
  * completely contained within either the direct map or the kernel map.  If
  * the virtual address range is contained within the kernel map, then the
  * memory type for each of the corresponding ranges of the direct map is also
  * changed.  (The corresponding ranges of the direct map are those ranges that
  * map the same physical pages as the specified virtual address range.)  These
  * changes to the direct map are necessary because Intel describes the
  * behavior of their processors as "undefined" if two or more mappings to the
  * same physical page have different memory types.
  *
  * Returns zero if the change completed successfully, and either EINVAL or
  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
  * of the virtual address range was not mapped, and ENOMEM is returned if
  * there was insufficient memory available to complete the change.  In the
  * latter case, the memory type may have been changed on some part of the
  * virtual address range or the direct map.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	int rv;

 	/*
 	 * Protections are left alone (PROT_NONE); only the memory type is
 	 * changed, with a cache flush requested for the affected range.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_props_locked(va, size, PROT_NONE, mode,
 	    MAPDEV_FLUSHCACHE);
 	PMAP_UNLOCK(kernel_pmap);

 	return (rv);
 }
 
 /*
  * Changes the specified virtual address range's protections to those
  * specified by "prot".  Like pmap_change_attr(), protections for aliases
  * in the direct map are updated as well.  Protections on aliasing mappings may
  * be a subset of the requested protections; for example, mappings in the direct
  * map are never executable.
  */
 int
 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
 {
 	int rv;

 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
 		 * A mode of -1 leaves the memory type unchanged.
 		 * MAPDEV_ASSERTVALID makes an unmapped address within the
 		 * range trip an assertion.
 		 */
 		PMAP_LOCK(kernel_pmap);
 		rv = pmap_change_props_locked(va, size, prot, -1,
 		    MAPDEV_ASSERTVALID);
 		PMAP_UNLOCK(kernel_pmap);
 	} else {
 		/* Only supported within the kernel map. */
 		rv = EINVAL;
 	}
 	return (rv);
 }
 
 /*
  * Common worker for changing the protection and/or memory type of a range
  * of kernel virtual addresses.  The kernel pmap lock must be held by the
  * caller (asserted).
  *
  * A "prot" of VM_PROT_NONE leaves the protection bits unchanged, and a
  * "mode" of -1 leaves the caching bits unchanged.  "flags" is a mask of
  * MAPDEV_* values: MAPDEV_ASSERTVALID turns an unmapped address into an
  * assertion failure, and MAPDEV_FLUSHCACHE requests a cache invalidation of
  * the range if anything was changed.
  *
  * The work is done in two passes.  The first verifies that the whole range
  * is mapped and demotes any 1GB or 2MB mapping that is only partially
  * covered and does not already have the requested properties.  The second
  * rewrites the entries and, for addresses at or above
  * VM_MIN_KERNEL_ADDRESS, gathers runs of contiguous physical addresses
  * below dmaplimit and applies the same change to their direct map aliases
  * by calling itself recursively.
  *
  * Returns 0 on success, EINVAL if the range starts below DMAP_MIN_ADDRESS
  * or is not fully mapped, ENOMEM if a demotion fails, or the error from a
  * recursive direct map update.
  */
 static int
 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
     int mode, int flags)
 {
 	vm_offset_t base, offset, tmpva;
 	vm_paddr_t pa_start, pa_end, pa_end1;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde, pde_bits, pde_mask;
 	pt_entry_t *pte, pte_bits, pte_mask;
 	int error;
 	bool changed;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/*
 	 * Only supported on kernel virtual addresses, including the direct
 	 * map but excluding the recursive map.
 	 */
 	if (base < DMAP_MIN_ADDRESS)
 		return (EINVAL);
 
 	/*
 	 * Construct our flag sets and masks.  "bits" is the subset of
 	 * "mask" that will be set in each modified PTE.
 	 *
 	 * Mappings in the direct map are never allowed to be executable.
 	 */
 	pde_bits = pte_bits = 0;
 	pde_mask = pte_mask = 0;
 	if (mode != -1) {
 		/* Large and 4KB entries use different cache bit layouts. */
 		pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
 		pde_mask |= X86_PG_PDE_CACHE;
 		pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
 		pte_mask |= X86_PG_PTE_CACHE;
 	}
 	if (prot != VM_PROT_NONE) {
 		if ((prot & VM_PROT_WRITE) != 0) {
 			pde_bits |= X86_PG_RW;
 			pte_bits |= X86_PG_RW;
 		}
 		/* Addresses below the kernel map always get pg_nx. */
 		if ((prot & VM_PROT_EXECUTE) == 0 ||
 		    va < VM_MIN_KERNEL_ADDRESS) {
 			pde_bits |= pg_nx;
 			pte_bits |= pg_nx;
 		}
 		pde_mask |= X86_PG_RW | pg_nx;
 		pte_mask |= X86_PG_RW | pg_nx;
 	}
 
 	/*
 	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 	 * into 4KB pages if required.
 	 */
 	for (tmpva = base; tmpva < base + size; ) {
 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
 		if (pdpe == NULL || *pdpe == 0) {
 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
 			return (EINVAL);
 		}
 		if (*pdpe & PG_PS) {
 			/*
 			 * If the current 1GB page already has the required
 			 * properties, then we need not demote this page.  Just
 			 * increment tmpva to the next 1GB page frame.
 			 */
 			if ((*pdpe & pde_mask) == pde_bits) {
 				tmpva = trunc_1gpage(tmpva) + NBPDP;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 1GB page frame
 			 * and there is at least 1GB left within the range, then
 			 * we need not break down this page into 2MB pages.
 			 */
 			if ((tmpva & PDPMASK) == 0 &&
 			    tmpva + PDPMASK < base + size) {
 				tmpva += NBPDP;
 				continue;
 			}
 			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 				return (ENOMEM);
 		}
 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
 		if (*pde == 0) {
 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
 			return (EINVAL);
 		}
 		if (*pde & PG_PS) {
 			/*
 			 * If the current 2MB page already has the required
 			 * properties, then we need not demote this page.  Just
 			 * increment tmpva to the next 2MB page frame.
 			 */
 			if ((*pde & pde_mask) == pde_bits) {
 				tmpva = trunc_2mpage(tmpva) + NBPDR;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 2MB page frame
 			 * and there is at least 2MB left within the range, then
 			 * we need not break down this page into 4KB pages.
 			 */
 			if ((tmpva & PDRMASK) == 0 &&
 			    tmpva + PDRMASK < base + size) {
 				tmpva += NBPDR;
 				continue;
 			}
 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 				return (ENOMEM);
 		}
 		pte = pmap_pde_to_pte(pde, tmpva);
 		if (*pte == 0) {
 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
 			return (EINVAL);
 		}
 		tmpva += PAGE_SIZE;
 	}
 	error = 0;
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * properties if required.
 	 */
 	changed = false;
 	/* pa_start == pa_end means no physical address run is open. */
 	pa_start = pa_end = 0;
 	for (tmpva = base; tmpva < base + size; ) {
 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
 		if (*pdpe & PG_PS) {
 			if ((*pdpe & pde_mask) != pde_bits) {
 				pmap_pte_props(pdpe, pde_bits, pde_mask);
 				changed = true;
 			}
 			/*
 			 * Only kernel map addresses whose backing physical
 			 * address is below dmaplimit contribute to the
 			 * direct map runs.
 			 */
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pdpe & PG_PS_FRAME;
 					pa_end = pa_start + NBPDP;
 				} else if (pa_end == (*pdpe & PG_PS_FRAME))
 					pa_end += NBPDP;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_props_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, prot, mode,
 					    flags);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pdpe & PG_PS_FRAME;
 					pa_end = pa_start + NBPDP;
 				}
 			}
 			tmpva = trunc_1gpage(tmpva) + NBPDP;
 			continue;
 		}
 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
 		if (*pde & PG_PS) {
 			if ((*pde & pde_mask) != pde_bits) {
 				pmap_pte_props(pde, pde_bits, pde_mask);
 				changed = true;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pde & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pde & PG_PS_FRAME;
 					pa_end = pa_start + NBPDR;
 				} else if (pa_end == (*pde & PG_PS_FRAME))
 					pa_end += NBPDR;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_props_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, prot, mode,
 					    flags);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pde & PG_PS_FRAME;
 					pa_end = pa_start + NBPDR;
 				}
 			}
 			tmpva = trunc_2mpage(tmpva) + NBPDR;
 		} else {
 			pte = pmap_pde_to_pte(pde, tmpva);
 			if ((*pte & pte_mask) != pte_bits) {
 				pmap_pte_props(pte, pte_bits, pte_mask);
 				changed = true;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pte & PG_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pte & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				} else if (pa_end == (*pte & PG_FRAME))
 					pa_end += PAGE_SIZE;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_props_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, prot, mode,
 					    flags);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pte & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				}
 			}
 			tmpva += PAGE_SIZE;
 		}
 	}
 	/*
 	 * Flush the last open run, clipped to the end of the direct map.
 	 */
 	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 		pa_end1 = MIN(pa_end, dmaplimit);
 		if (pa_start != pa_end1)
 			error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start),
 			    pa_end1 - pa_start, prot, mode, flags);
 	}
 
 	/*
 	 * Flush CPU caches if required to make sure any data isn't cached that
 	 * shouldn't be, etc.
 	 */
 	/* "tmpva" is where the second pass stopped, possibly early. */
 	if (changed) {
 		pmap_invalidate_range(kernel_pmap, base, tmpva);
 		if ((flags & MAPDEV_FLUSHCACHE) != 0)
 			pmap_invalidate_cache_range(base, tmpva);
 	}
 	return (error);
 }
 
 /*
  * Demotes any mapping within the direct map region that covers more than the
  * specified range of physical addresses.  This range's size must be a power
  * of two and its starting address must be a multiple of its size.  Since the
  * demotion does not change any attributes of the mapping, a TLB invalidation
  * is not mandatory.  The caller may, however, request a TLB invalidation.
  */
 void
 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	vm_offset_t dmap_va;
 	boolean_t demoted;

 	if (len == 0)
 		return;
 	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 	KASSERT((base & (len - 1)) == 0,
 	    ("pmap_demote_DMAP: base is not a multiple of len"));

 	/*
 	 * A range of 1GB or more never needs a demotion, and addresses at
 	 * or beyond dmaplimit are not handled here.
 	 */
 	if (len >= NBPDP || base >= dmaplimit)
 		return;

 	dmap_va = PHYS_TO_DMAP(base);
 	demoted = FALSE;
 	PMAP_LOCK(kernel_pmap);

 	/* Break up a 1GB mapping that covers the range. */
 	pdpe = pmap_pdpe(kernel_pmap, dmap_va);
 	if ((*pdpe & X86_PG_V) == 0)
 		panic("pmap_demote_DMAP: invalid PDPE");
 	if ((*pdpe & PG_PS) != 0) {
 		if (!pmap_demote_pdpe(kernel_pmap, pdpe, dmap_va))
 			panic("pmap_demote_DMAP: PDPE failed");
 		demoted = TRUE;
 	}

 	/* Likewise for a 2MB mapping, if the range is smaller than 2MB. */
 	if (len < NBPDR) {
 		pde = pmap_pdpe_to_pde(pdpe, dmap_va);
 		if ((*pde & X86_PG_V) == 0)
 			panic("pmap_demote_DMAP: invalid PDE");
 		if ((*pde & PG_PS) != 0) {
 			if (!pmap_demote_pde(kernel_pmap, pde, dmap_va))
 				panic("pmap_demote_DMAP: PDE failed");
 			demoted = TRUE;
 		}
 	}

 	/* The invalidation is issued only when the caller asked for it. */
 	if (demoted && invalidate)
 		pmap_invalidate_page(kernel_pmap, dmap_va);
 	PMAP_UNLOCK(kernel_pmap);
 }
 
 /*
  * Perform the pmap work for mincore(2).  If the page is not both referenced and
  * modified by this pmap, returns its physical address so that the caller can
  * find other mappings.
  *
  * The return value is a mask of MINCORE_* flags describing the mapping of
  * "addr" in "pmap": MINCORE_PSIND(2) or MINCORE_PSIND(1) when the address
  * is covered by a 1GB or 2MB mapping, MINCORE_INCORE when the mapping is
  * valid, and the MODIFIED/REFERENCED flags (with their _OTHER variants) as
  * indicated by the entry's PG_M, PG_RW and PG_A bits.  Zero is returned
  * when nothing is mapped.  "*pap" is written only for a managed, valid
  * mapping that is not reported as both modified and referenced.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pdep;
 	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pa;
 	int val;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK(pmap);
 	/* "pte" holds the leaf entry at whichever level maps "addr". */
 	pte = 0;
 	pa = 0;
 	val = 0;
 	pdpe = pmap_pdpe(pmap, addr);
 	if (pdpe == NULL)
 		goto out;
 	if ((*pdpe & PG_V) != 0) {
 		if ((*pdpe & PG_PS) != 0) {
 			pte = *pdpe;
 			/* Compute the physical address of the 4KB page. */
 			pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
 			    PG_FRAME;
 			val = MINCORE_PSIND(2);
 		} else {
 			pdep = pmap_pde(pmap, addr);
 			if (pdep != NULL && (*pdep & PG_V) != 0) {
 				if ((*pdep & PG_PS) != 0) {
 					pte = *pdep;
 					/*
 					 * Compute the physical address of the
 					 * 4KB page.
 					 */
 					pa = ((pte & PG_PS_FRAME) | (addr &
 					    PDRMASK)) & PG_FRAME;
 					val = MINCORE_PSIND(1);
 				} else {
 					pte = *pmap_pde_to_pte(pdep, addr);
 					pa = pte & PG_FRAME;
 					val = 0;
 				}
 			}
 		}
 	}
 	if ((pte & PG_V) != 0) {
 		val |= MINCORE_INCORE;
 		/* Reported as modified only if also writable. */
 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((pte & PG_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	/*
 	 * Hand back the physical address unless this mapping alone already
 	 * establishes both the modified and the referenced state.
 	 */
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 		*pap = pa;
 	}
 out:
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Ensure that "pmap" has a PCID for CPU "cpuid" that belongs to the CPU's
  * current PCID generation.
  *
  * Returns CR3_PCID_SAVE when the pmap's existing PCID is still current,
  * and 0 when a new PCID had to be assigned.  For a pmap whose PCID is
  * PMAP_PCID_KERN the result is 0 if "pti" is set and CR3_PCID_SAVE
  * otherwise.  Must be called within a critical section (asserted).
  */
 static uint64_t
 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
 {
 	uint32_t cur_gen, next_gen, pcid, pcid_limit;

 	CRITICAL_ASSERT(curthread);
 	cur_gen = PCPU_GET(pcid_gen);
 	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) {
 		if (pti)
 			return (0);
 		return (CR3_PCID_SAVE);
 	}
 	if (pmap->pm_pcids[cpuid].pm_gen == cur_gen)
 		return (CR3_PCID_SAVE);

 	/* The value at which the per-CPU PCID counter wraps. */
 	pcid_limit = pti ? PMAP_PCID_OVERMAX_KERN : PMAP_PCID_OVERMAX;
 	pcid = PCPU_GET(pcid_next);
 	KASSERT(pcid <= pcid_limit,
 	    ("cpu %d pcid_next %#x", cpuid, pcid));

 	next_gen = cur_gen;
 	if (pcid == pcid_limit) {
 		/*
 		 * The PCID space is exhausted: start a new generation,
 		 * skipping generation 0, and restart numbering just above
 		 * PMAP_PCID_KERN.
 		 */
 		next_gen++;
 		if (next_gen == 0)
 			next_gen = 1;
 		PCPU_SET(pcid_gen, next_gen);
 		pcid = PMAP_PCID_KERN + 1;
 	}
 	pmap->pm_pcids[cpuid].pm_pcid = pcid;
 	pmap->pm_pcids[cpuid].pm_gen = next_gen;
 	PCPU_SET(pcid_next, pcid + 1);
 	return (0);
 }
 
 /*
  * Wrapper around pmap_pcid_alloc() that asserts the resulting PCID is
  * below PMAP_PCID_OVERMAX and that only the kernel pmap ever holds
  * PMAP_PCID_KERN.  The result of pmap_pcid_alloc() is returned unchanged.
  */
 static uint64_t
 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
 {
 	uint64_t save_flag;

 	save_flag = pmap_pcid_alloc(pmap, cpuid);

 	/* The PCID must lie within the allocatable range. */
 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
 	    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
 	    pmap->pm_pcids[cpuid].pm_pcid));

 	/* PMAP_PCID_KERN is reserved for the kernel pmap. */
 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
 	    pmap == kernel_pmap,
 	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
 	    pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));

 	return (save_flag);
 }
 
 /*
  * Set the rsp0 field of this CPU's TSS for the pmap being activated: the
  * per-CPU pti_rsp0 value when the pmap has a user page table root
  * (pm_ucr3 != PMAP_NO_CR3), and the thread's md_stack_base otherwise.
  */
 static void
 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
 {
 	uint64_t rsp0;

 	if (pmap->pm_ucr3 != PMAP_NO_CR3)
 		rsp0 = PCPU_GET(pti_rsp0);
 	else
 		rsp0 = (uintptr_t)td->td_md.md_stack_base;
 	PCPU_GET(tssp)->tss_rsp0 = rsp0;
 }
 
 /*
  * Context-switch activation of "pmap" on CPU "cpuid" for the configuration
  * with both PCID and PTI enabled.  Must be called with interrupts disabled
  * (asserted).  Loads %cr3 if the page table root changes, records the new
  * current pmap, and publishes the kernel and user %cr3 values, both with
  * CR3_PCID_SAVE set, in the per-CPU area.
  */
 static void
 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
 {
 	pmap_t old_pmap;
 	uint64_t cached, cr3, kcr3, ucr3;
 
 	KASSERT((read_rflags() & PSL_I) == 0,
 	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
 
 	/* See the comment in pmap_invalidate_page_pcid(). */
 	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
 		/*
 		 * Reset the mask and zero the outgoing pmap's generation
 		 * for this CPU.
 		 * NOTE(review): presumably a zero generation never matches
 		 * the CPU's pcid_gen, so a fresh PCID is allocated on the
 		 * pmap's next activation -- confirm.
 		 */
 		PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
 		old_pmap = PCPU_GET(curpmap);
 		MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
 		old_pmap->pm_pcids[cpuid].pm_gen = 0;
 	}
 
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	/* Reload %cr3 only if the page table root actually changes. */
 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
 	PCPU_SET(curpmap, pmap);
 	kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
 	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 	    PMAP_PCID_USER_PT;
 
 	/*
 	 * A newly assigned PCID on a pmap with a user page table: install
 	 * a load mask that strips CR3_PCID_SAVE.
 	 */
 	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
 		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 
 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 	if (cached)
 		counter_u64_add(pcid_save_cnt, 1);
 
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 /*
  * Context-switch activation of "pmap" on CPU "cpuid" for the configuration
  * with PCID enabled and PTI disabled.  Must be called with interrupts
  * disabled (asserted).
  */
 static void
 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid)
 {
 	uint64_t cached, cr3;
 
 	KASSERT((read_rflags() & PSL_I) == 0,
 	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
 
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	/*
 	 * %cr3 is loaded if the PCID was newly assigned or the page table
 	 * root changes.  "cached" is either 0 or CR3_PCID_SAVE, so OR-ing
 	 * it in sets CR3_PCID_SAVE exactly when the PCID was still current.
 	 */
 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
 		    cached);
 	PCPU_SET(curpmap, pmap);
 	if (cached)
 		counter_u64_add(pcid_save_cnt, 1);
 }
 
 /*
  * Activate 'pmap' on the current CPU for the configuration with neither
  * PCID nor PTI: load the pmap's page table root into %cr3, then record
  * the pmap as the per-CPU curpmap.
  */
 static void
 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid __unused)
 {
 	uint64_t root;
 
 	root = pmap->pm_cr3;
 	load_cr3(root);
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  * Activate 'pmap' on the current CPU for the configuration with PTI
  * enabled and PCID disabled.  The %cr3 load and curpmap update are
  * delegated to pmap_activate_sw_nopcid_nopti(); afterwards the per-CPU
  * kcr3/ucr3 fields receive the pmap's kernel and user page table roots
  * and the TSS is updated by pmap_activate_sw_pti_post().
  */
 static void
 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
     u_int cpuid __unused)
 {
 
 	pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
 	PCPU_SET(kcr3, pmap->pm_cr3);
 	PCPU_SET(ucr3, pmap->pm_ucr3);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 /*
  * Resolver for pmap_activate_sw_mode(): pick the activation routine that
  * matches the pmap_pcid_enabled / pti combination.
  */
 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
     u_int))
 {
 
 	if (pmap_pcid_enabled) {
 		return (pti ? pmap_activate_sw_pcid_pti :
 		    pmap_activate_sw_pcid_nopti);
 	}
 	return (pti ? pmap_activate_sw_nopcid_pti :
 	    pmap_activate_sw_nopcid_nopti);
 }
 
 /*
  * Switch the current CPU to the pmap of thread 'td'.
  *
  * If that pmap is already the per-CPU curpmap nothing is switched; on
  * non-Intel CPUs an mfence() is still issued before returning.  Otherwise
  * the CPU is added to the new pmap's pm_active set, the mode-specific
  * activation routine is run, and only then is the CPU removed from the
  * old pmap's pm_active set.
  */
 void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
 	u_int cpuid;
 
 	oldpmap = PCPU_GET(curpmap);
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	if (oldpmap == pmap) {
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		return;
 	}
 	cpuid = PCPU_GET(cpuid);
 	/* Mark this CPU active in the new pmap before switching to it. */
 #ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 #else
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
 	pmap_activate_sw_mode(td, pmap, cpuid);
 	/* The old pmap is released only after the switch has completed. */
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 #else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 #endif
 }
 
 /*
  * Activate the address space of thread 'td' on the current CPU.  This is
  * pmap_activate_sw() bracketed by spinlock_enter()/spinlock_exit(); the
  * comment in the body explains why the bracket is required.
  */
 void
 pmap_activate(struct thread *td)
 {
 	/*
 	 * invltlb_{invpcid,}_pcid_handler() is used to handle an
 	 * invalidate_all IPI, which checks for curpmap ==
 	 * smp_tlb_pmap.  The below sequence of operations has a
 	 * window where %CR3 is loaded with the new pmap's PML4
 	 * address, but the curpmap value has not yet been updated.
 	 * This causes the invltlb IPI handler, which is called
 	 * between the updates, to execute as a NOP, which leaves
 	 * stale TLB entries.
 	 *
 	 * Note that the most common use of pmap_activate_sw(), from
 	 * a context switch, is immune to this race, because
 	 * interrupts are disabled (while the thread lock is owned),
 	 * so the IPI is delayed until after curpmap is updated.  Protect
 	 * other callers in a similar way, by disabling interrupts
 	 * around the %cr3 register reload and curpmap assignment.
 	 */
 	spinlock_enter();
 	pmap_activate_sw(td);
 	spinlock_exit();
 }
 
 /*
  * Record 'pmap' as the active pmap of the calling CPU without touching
  * %cr3: the CPU is added to pm_active, curpmap is set, and the per-CPU
  * kcr3/ucr3 fields are initialized.  kcr3 holds the pmap's page table
  * root (plus PCID and CR3_PCID_SAVE when PCID is enabled) under PTI and
  * PMAP_NO_CR3 otherwise; ucr3 is always PMAP_NO_CR3.
  */
 void
 pmap_activate_boot(pmap_t pmap)
 {
 	uint64_t boot_kcr3;
 	u_int cpu;
 
 	/*
 	 * kernel_pmap must never be deactivated, which is guaranteed by
 	 * never activating it in the first place.
 	 */
 	MPASS(pmap != kernel_pmap);
 
 	cpu = PCPU_GET(cpuid);
 #ifdef SMP
 	CPU_SET_ATOMIC(cpu, &pmap->pm_active);
 #else
 	CPU_SET(cpu, &pmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 
 	boot_kcr3 = PMAP_NO_CR3;
 	if (pti) {
 		boot_kcr3 = pmap->pm_cr3;
 		if (pmap_pcid_enabled) {
 			boot_kcr3 |= pmap->pm_pcids[cpu].pm_pcid |
 			    CR3_PCID_SAVE;
 		}
 	}
 	PCPU_SET(kcr3, boot_kcr3);
 	PCPU_SET(ucr3, PMAP_NO_CR3);
 }
 
 /*
  * Instruction cache synchronization hook for the range [va, va + sz) of
  * pmap 'pm'.  This implementation has an empty body and performs no work.
  */
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 }
 
 /*
  *	Move the starting virtual address of the given mapping upward when
  *	a different alignment could yield more superpage mappings.  The
  *	address is left untouched when the mapping is smaller than a
  *	superpage, when it would not contain a full superpage after
  *	alignment, or when it already has the desired offset.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t addr_off, base, want_off;
 
 	if (size < NBPDR)
 		return;
 
 	/* Account for the object's page color, if it has one. */
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	want_off = offset & PDRMASK;
 	addr_off = *addr & PDRMASK;
 
 	if (addr_off == want_off ||
 	    size - ((NBPDR - want_off) & PDRMASK) < NBPDR)
 		return;
 
 	/* Pick the superpage base at or above *addr that fits want_off. */
 	if (addr_off < want_off)
 		base = *addr & ~PDRMASK;
 	else
 		base = (*addr + PDRMASK) & ~PDRMASK;
 	*addr = base + want_off;
 }
 
 /*
  * Statistics for pmap_emulate_accessed_dirty(), compiled in only with
  * INVARIANTS and exported read-write under the vm.pmap sysctl node.
  */
 #ifdef INVARIANTS
 /* Write faults handled by setting the modified and accessed bits. */
 static unsigned long num_dirty_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
 	     &num_dirty_emulations, 0, NULL);
 
 /* Read faults on 4KB mappings handled by setting the accessed bit. */
 static unsigned long num_accessed_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
 	     &num_accessed_emulations, 0, NULL);
 
 /* Read faults on superpage (PG_PS) mappings handled at the PDE level. */
 static unsigned long num_superpage_accessed_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
 	     &num_superpage_accessed_emulations, 0, NULL);
 
 /* Times the emulation path called pmap_promote_pde(). */
 static unsigned long ad_emulation_superpage_promotions;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
 	     &ad_emulation_superpage_promotions, 0, NULL);
 #endif	/* INVARIANTS */
 
 /*
  * Emulate the accessed/dirty bits for a fault at 'va' in 'pmap'.
  *
  * 'ftype' must be VM_PROT_READ or VM_PROT_WRITE (asserted).
  *
  * Returns 0 when the fault was handled by updating the page table entry
  * and -1 otherwise: the pmap does not emulate A/D bits, the mapping is
  * not valid, a write hit a superpage mapping, or a write hit a 4KB
  * mapping without PG_RW.
  *
  * The pmap lock is held for the duration of the page table walk.  When
  * superpage reservations are compiled in, a successful 4KB update may be
  * followed by an attempt to promote the mapping.
  */
 int
 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
 {
 	int rv;
 	struct rwlock *lock;
 #if VM_NRESERVLEVEL > 0
 	vm_page_t m, mpte;
 #endif
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
 
 	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
 	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
 
 	if (!pmap_emulate_ad_bits(pmap))
 		return (-1);
 
 	/* The bit encodings depend on the pmap type; fetch them per pmap. */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	rv = -1;
 	lock = NULL;
 	PMAP_LOCK(pmap);
 
 	pde = pmap_pde(pmap, va);
 	if (pde == NULL || (*pde & PG_V) == 0)
 		goto done;
 
 	/* Superpage mapping: only read faults are emulated here. */
 	if ((*pde & PG_PS) != 0) {
 		if (ftype == VM_PROT_READ) {
 #ifdef INVARIANTS
 			atomic_add_long(&num_superpage_accessed_emulations, 1);
 #endif
 			*pde |= PG_A;
 			rv = 0;
 		}
 		goto done;
 	}
 
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		goto done;
 
 	if (ftype == VM_PROT_WRITE) {
 		if ((*pte & PG_RW) == 0)
 			goto done;
 		/*
 		 * Set the modified and accessed bits simultaneously.
 		 *
 		 * Intel EPT PTEs that do software emulation of A/D bits map
 		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
 		 * An EPT misconfiguration is triggered if the PTE is writable
 		 * but not readable (WR=10). This is avoided by setting PG_A
 		 * and PG_M simultaneously.
 		 */
 		*pte |= PG_M | PG_A;
 	} else {
 		*pte |= PG_A;
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/* try to promote the mapping */
 	if (va < VM_MAXUSER_ADDRESS)
 		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	else
 		mpte = NULL;
 
 	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 
 	/*
 	 * Promotion is attempted only when the page table page (if tracked)
 	 * has all NPTEPG entries referenced, superpages are enabled for the
 	 * pmap, the page is not fictitious, and vm_reserv_level_iffullpop()
 	 * returns 0 for it.
 	 */
 	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0) {
 		pmap_promote_pde(pmap, pde, va, &lock);
 #ifdef INVARIANTS
 		atomic_add_long(&ad_emulation_superpage_promotions, 1);
 #endif
 	}
 #endif
 
 #ifdef INVARIANTS
 	if (ftype == VM_PROT_WRITE)
 		atomic_add_long(&num_dirty_emulations, 1);
 	else
 		atomic_add_long(&num_accessed_emulations, 1);
 #endif
 	rv = 0;		/* success */
 done:
 	/* 'lock' is non-NULL only if pmap_promote_pde() acquired one. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Collect the raw page table entries that translate 'va' in 'pmap'.
  *
  * Entries are stored into 'ptr' from the PML4 level downward; the walk
  * stops after the first entry that is invalid or, below the PML4 level,
  * maps a large page.  '*num' receives the number of entries stored
  * (0..4).  The caller must supply room for four entries.
  */
 void
 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 {
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	int nstored;
 
 	nstored = 0;
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 
 	do {
 		pml4e = pmap_pml4e(pmap, va);
 		if (pml4e == NULL)
 			break;
 		ptr[nstored++] = *pml4e;
 		if ((*pml4e & PG_V) == 0)
 			break;
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
 		ptr[nstored++] = *pdpe;
 		if ((*pdpe & PG_V) == 0 || (*pdpe & PG_PS) != 0)
 			break;
 
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		ptr[nstored++] = *pde;
 		if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
 			break;
 
 		pte = pmap_pde_to_pte(pde, va);
 		ptr[nstored++] = *pte;
 	} while (0);
 
 	PMAP_UNLOCK(pmap);
 	*num = nstored;
 }
 
 /**
  * Get the kernel virtual address of a set of physical pages. If there are
  * physical addresses not covered by the DMAP perform a transient mapping
  * that will be removed when calling pmap_unmap_io_transient.
  *
  * \param page        The pages the caller wishes to obtain the virtual
  *                    address on the kernel memory map.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of the pages passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  * When a transient mapping is needed and can_fault is FALSE, the thread
  * is pinned with sched_pin() on return; pmap_unmap_io_transient() undoes
  * the pin.
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t paddr;
 	boolean_t needs_mapping;
 	pt_entry_t *pte;
 	int cache_bits, error __unused, i;
 
 	/*
 	 * Allocate any KVA space that we need, this is done in a separate
 	 * loop to prevent calling vmem_alloc while pinned.
 	 */
 	needs_mapping = FALSE;
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (__predict_false(paddr >= dmaplimit)) {
 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 			needs_mapping = TRUE;
 		} else {
 			/* Page is reachable through the direct map. */
 			vaddr[i] = PHYS_TO_DMAP(paddr);
 		}
 	}
 
 	/* Exit early if everything is covered by the DMAP */
 	if (!needs_mapping)
 		return (FALSE);
 
 	/*
 	 * NB:  The sequence of updating a page table followed by accesses
 	 * to the corresponding pages used in the !DMAP case is subject to
 	 * the situation described in the "AMD64 Architecture Programmer's
 	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
 	 * Coherency Considerations".  Therefore, issuing the INVLPG right
 	 * after modifying the PTE bits is crucial.
 	 */
 	if (!can_fault)
 		sched_pin();
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (paddr >= dmaplimit) {
 			if (can_fault) {
 				/*
 				 * Slow path, since we can get page faults
 				 * while mappings are active don't pin the
 				 * thread to the CPU and instead add a global
 				 * mapping visible to all CPUs.
 				 */
 				pmap_qenter(vaddr[i], &page[i], 1);
 			} else {
 				/*
 				 * Fast path: write the PTE directly and
 				 * invalidate it with a local INVLPG only.
 				 */
 				pte = vtopte(vaddr[i]);
 				cache_bits = pmap_cache_bits(kernel_pmap,
 				    page[i]->md.pat_mode, 0);
 				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 				    cache_bits);
 				invlpg(vaddr[i]);
 			}
 		}
 	}
 
 	return (needs_mapping);
 }
 
 /*
  * Undo pmap_map_io_transient() for the same page/vaddr/count/can_fault
  * arguments: unpin the thread when can_fault is false, and for every page
  * at or above dmaplimit remove the global mapping (can_fault case only)
  * and release its KVA back to kernel_arena.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	int idx;
 
 	if (!can_fault)
 		sched_unpin();
 	for (idx = 0; idx < count; idx++) {
 		/* Pages served by the direct map own no transient KVA. */
 		if (VM_PAGE_TO_PHYS(page[idx]) < dmaplimit)
 			continue;
 		if (can_fault)
 			pmap_qremove(vaddr[idx], 1);
 		vmem_free(kernel_arena, vaddr[idx], PAGE_SIZE);
 	}
 }
 
 /*
  * Return a kernel virtual address through which page 'm' can be accessed.
  * Pages below dmaplimit are served by the direct map.  Any other page is
  * mapped at the single 'qframe' address; in that case qframe_mtx is
  * taken here and remains held until pmap_quick_remove_page() is called.
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	if (pa >= dmaplimit) {
 		mtx_lock_spin(&qframe_mtx);
 		KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
 		pte_store(vtopte(qframe), pa | X86_PG_RW | X86_PG_V |
 		    X86_PG_A | X86_PG_M |
 		    pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
 		return (qframe);
 	}
 	return (PHYS_TO_DMAP(pa));
 }
 
 /*
  * Release an address returned by pmap_quick_enter_page().  Only the
  * 'qframe' address needs work: its PTE is cleared, the TLB entry is
  * invalidated, and qframe_mtx is dropped.  Any other address is ignored.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 
 	if (addr == qframe) {
 		pte_store(vtopte(qframe), 0);
 		invlpg(qframe);
 		mtx_unlock_spin(&qframe_mtx);
 	}
 }
 
 /*
  * Pdp pages of the large map are handled differently from both kernel
  * and user page table pages: they are allocated once at initialization
  * time, their reference count stays at zero forever, and the pml4
  * entries that point to them are copied into every allocated pmap.
  *
  * Pd and pt pages, in contrast, are handled like user page table pages:
  * they are allocated on demand and their reference count equals the
  * number of valid entries within the page.
  *
  * This helper allocates one zeroed page table page for the large map; it
  * returns the result of pmap_alloc_pt_page() unchanged.
  */
 static vm_page_t
 pmap_large_map_getptp_unlocked(void)
 {
 	vm_page_t ptp;
 
 	ptp = pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO);
 	return (ptp);
 }
 
 /*
  * Allocate a large map page table page with the kernel pmap lock held.
  * When the allocation fails the lock is dropped around vm_wait() and
  * re-acquired, and NULL is returned so that the caller can retry its
  * lookup from the start.
  */
 static vm_page_t
 pmap_large_map_getptp(void)
 {
 	vm_page_t ptp;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	ptp = pmap_large_map_getptp_unlocked();
 	if (ptp != NULL)
 		return (ptp);
 
 	/* Out of pages: sleep without the lock.  Callers retry. */
 	PMAP_UNLOCK(kernel_pmap);
 	vm_wait(NULL);
 	PMAP_LOCK(kernel_pmap);
 	return (NULL);
 }
 
 /*
  * Return a pointer to the page directory pointer entry that covers the
  * large map address 'va'.  The pml4 slot must lie inside the large map
  * range and must already be valid; both conditions are asserted.
  */
 static pdp_entry_t *
 pmap_large_map_pdpe(vm_offset_t va)
 {
 	pdp_entry_t *pdp_page;
 	vm_pindex_t slot;
 	vm_paddr_t pdp_pa;
 
 	slot = pmap_pml4e_index(va);
 	KASSERT(LMSPML4I <= slot && slot < LMSPML4I + lm_ents,
 	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
 	    "%#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)slot, LMSPML4I, lm_ents));
 	KASSERT((kernel_pml4[slot] & X86_PG_V) != 0,
 	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
 	    "LMSPML4I %#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)slot, LMSPML4I, lm_ents));
 
 	/* Reach the pdp page through the direct map. */
 	pdp_pa = kernel_pml4[slot] & PG_FRAME;
 	pdp_page = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pa);
 	return (&pdp_page[pmap_pdpe_index(va)]);
 }
 
 /*
  * Return a pointer to the page directory entry that covers the large map
  * address 'va', allocating and installing the page directory page when
  * the pdp entry is still empty.  The lookup restarts whenever the
  * allocation helper had to wait, since it drops the kernel pmap lock.
  */
 static pd_entry_t *
 pmap_large_map_pde(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	vm_page_t ptp;
 	vm_paddr_t pd_pa;
 
 	for (;;) {
 		pdpe = pmap_large_map_pdpe(va);
 		if (*pdpe != 0) {
 			MPASS((*pdpe & X86_PG_PS) == 0);
 			pd_pa = *pdpe & PG_FRAME;
 			break;
 		}
 		ptp = pmap_large_map_getptp();
 		if (ptp == NULL)
 			continue;
 		pd_pa = VM_PAGE_TO_PHYS(ptp);
 		*pdpe = pd_pa | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 		break;
 	}
 	return ((pd_entry_t *)PHYS_TO_DMAP(pd_pa) + pmap_pde_index(va));
 }
 
 /*
  * Return a pointer to the page table entry that covers the large map
  * address 'va', allocating and installing the page table page when the
  * pde is still empty.  Installing a new page table page adds one
  * reference to the page directory page that holds the pde.  The lookup
  * restarts whenever the allocation helper had to wait.
  */
 static pt_entry_t *
 pmap_large_map_pte(vm_offset_t va)
 {
 	pd_entry_t *pde;
 	vm_page_t ptp;
 	vm_paddr_t pt_pa;
 
 	for (;;) {
 		pde = pmap_large_map_pde(va);
 		if (*pde != 0) {
 			MPASS((*pde & X86_PG_PS) == 0);
 			pt_pa = *pde & PG_FRAME;
 			break;
 		}
 		ptp = pmap_large_map_getptp();
 		if (ptp == NULL)
 			continue;
 		pt_pa = VM_PAGE_TO_PHYS(ptp);
 		*pde = pt_pa | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 		PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++;
 		break;
 	}
 	return ((pt_entry_t *)PHYS_TO_DMAP(pt_pa) + pmap_pte_index(va));
 }
 
 /*
  * Translate the large map virtual address 'va' to a physical address by
  * walking the large map page tables.  The address must lie in the large
  * map and every level of the walk must be valid (asserted under
  * INVARIANTS).  1GB and 2MB mappings are resolved at the pdp and pd level
  * respectively; otherwise the 4KB pte supplies the frame.
  */
 static vm_paddr_t
 pmap_large_map_kextract(vm_offset_t va)
 {
 	pdp_entry_t *pdpe, pdp;
 	pd_entry_t *pde, pd;
 	pt_entry_t *pte, pt;
 
 	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
 	    ("not largemap range %#lx", (u_long)va));
 	pdpe = pmap_large_map_pdpe(va);
 	pdp = *pdpe;
 	KASSERT((pdp & X86_PG_V) != 0,
 	    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 	    (u_long)pdpe, pdp));
 	/* 1GB mapping: frame from the pdp entry, offset from va. */
 	if ((pdp & X86_PG_PS) != 0) {
 		KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 		    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 		    (u_long)pdpe, pdp));
 		return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
 	}
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	pd = *pde;
 	KASSERT((pd & X86_PG_V) != 0,
 	    ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
 	/* 2MB mapping: frame from the pd entry, offset from va. */
 	if ((pd & X86_PG_PS) != 0)
 		return ((pd & PG_PS_FRAME) | (va & PDRMASK));
 	pte = pmap_pde_to_pte(pde, va);
 	pt = *pte;
 	KASSERT((pt & X86_PG_V) != 0,
 	    ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
 	return ((pt & PG_FRAME) | (va & PAGE_MASK));
 }
 
 /*
  * Allocate 'len' bytes of large map KVA from large_vmem with the given
  * alignment and phase.  The allocated address is stored in '*vmem_res';
  * the return value is that of vmem_xalloc().
  */
 static int
 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
     vmem_addr_t *vmem_res)
 {
 	int error;
 
 	/*
 	 * Large mappings are almost static, so waiting for an earlier
 	 * allocation to be freed would be pointless; hence M_NOWAIT.
 	 */
 	error = vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
 	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res);
 	return (error);
 }
 
 /*
  * Map the physical range [spa, spa + len) into kernel virtual address
  * space with memory attribute 'mattr'.
  *
  * Returns EINVAL when 'len' is zero or the range wraps.  When the whole
  * range ends at or below dmaplimit, the direct map address is returned
  * in '*addr' and the result of pmap_change_attr() on it is returned.
  * Otherwise KVA is allocated from the large map arena and the page
  * tables are filled with 1GB, 2MB or 4KB mappings as alignment and the
  * remaining length allow; '*addr' is set and 0 is returned, or the KVA
  * allocation error is returned.
  */
 int
 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
     vm_memattr_t mattr)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_offset_t va, inc;
 	vmem_addr_t vmem_res;
 	vm_paddr_t pa;
 	int error;
 
 	if (len == 0 || spa + len < spa)
 		return (EINVAL);
 
 	/* See if DMAP can serve. */
 	if (spa + len <= dmaplimit) {
 		va = PHYS_TO_DMAP(spa);
 		*addr = (void *)va;
 		return (pmap_change_attr(va, len, mattr));
 	}
 
 	/*
 	 * No, allocate KVA.  Fit the address with best possible
 	 * alignment for superpages.  Fall back to worse align if
 	 * failed.
 	 */
 	error = ENOMEM;
 	/* Try 1GB alignment if the range contains a whole aligned 1GB page. */
 	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
 	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
 		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
 		    &vmem_res);
 	/* Then 2MB alignment if the range contains a whole aligned 2MB page. */
 	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
 	    NBPDR) + NBPDR)
 		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
 		    &vmem_res);
 	/* Finally settle for page alignment. */
 	if (error != 0)
 		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
 	 * in the pagetable to minimize flushing.  No need to
 	 * invalidate TLB, since we only update invalid entries.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
 	    len -= inc) {
 		if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
 		    (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
 			/* 1GB mapping; pdp pages are not reference counted. */
 			pdpe = pmap_large_map_pdpe(va);
 			MPASS(*pdpe == 0);
 			*pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
 			    X86_PG_V | X86_PG_A | pg_nx |
 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
 			inc = NBPDP;
 		} else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
 		    (va & PDRMASK) == 0) {
 			/* 2MB mapping; count it in the page directory page. */
 			pde = pmap_large_map_pde(va);
 			MPASS(*pde == 0);
 			*pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
 			    X86_PG_V | X86_PG_A | pg_nx |
 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
 			    ref_count++;
 			inc = NBPDR;
 		} else {
 			/* 4KB mapping; count it in the page table page. */
 			pte = pmap_large_map_pte(va);
 			MPASS(*pte == 0);
 			*pte = pa | pg_g | X86_PG_RW | X86_PG_V |
 			    X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
 			    mattr, FALSE);
 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
 			    ref_count++;
 			inc = PAGE_SIZE;
 		}
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	MPASS(len == 0);
 
 	*addr = (void *)vmem_res;
 	return (0);
 }
 
 /*
  * Remove a mapping previously created by pmap_large_map().
  *
  * Nothing is done for a zero length, a wrapping range, or a range that
  * lies entirely inside the direct map.  Otherwise the range must lie in
  * the large map (asserted): every 1GB, 2MB and 4KB mapping in it is
  * cleared, page table pages whose reference count drops to zero are
  * unlinked from their parent and freed, the TLB is invalidated for the
  * range, and the KVA is returned to the large map arena.
  */
 void
 pmap_large_unmap(void *svaa, vm_size_t len)
 {
 	vm_offset_t sva, va;
 	vm_size_t inc;
 	pdp_entry_t *pdpe, pdp;
 	pd_entry_t *pde, pd;
 	pt_entry_t *pte;
 	vm_page_t m;
 	struct spglist spgf;
 
 	sva = (vm_offset_t)svaa;
 	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
 	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
 		return;
 
 	/* Page table pages to free are collected here and freed at the end. */
 	SLIST_INIT(&spgf);
 	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
 	    PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
 	    ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
 	PMAP_LOCK(kernel_pmap);
 	for (va = sva; va < sva + len; va += inc) {
 		pdpe = pmap_large_map_pdpe(va);
 		pdp = *pdpe;
 		KASSERT((pdp & X86_PG_V) != 0,
 		    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 		    (u_long)pdpe, pdp));
 		/* 1GB mapping: clear the pdp entry; no page to release. */
 		if ((pdp & X86_PG_PS) != 0) {
 			KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 			    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 			    (u_long)pdpe, pdp));
 			KASSERT((va & PDPMASK) == 0,
 			    ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
 			    (u_long)pdpe, pdp));
 			KASSERT(va + NBPDP <= sva + len,
 			    ("unmap covers partial 1GB page, sva %#lx va %#lx "
 			    "pdpe %#lx pdp %#lx len %#lx", sva, va,
 			    (u_long)pdpe, pdp, len));
 			*pdpe = 0;
 			inc = NBPDP;
 			continue;
 		}
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		pd = *pde;
 		KASSERT((pd & X86_PG_V) != 0,
 		    ("invalid pd va %#lx pde %#lx pd %#lx", va,
 		    (u_long)pde, pd));
 		/*
 		 * 2MB mapping: clear the pde and drop one reference on the
 		 * page directory page; release that page when it is empty.
 		 */
 		if ((pd & X86_PG_PS) != 0) {
 			KASSERT((va & PDRMASK) == 0,
 			    ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
 			    (u_long)pde, pd));
 			KASSERT(va + NBPDR <= sva + len,
 			    ("unmap covers partial 2MB page, sva %#lx va %#lx "
 			    "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
 			    pd, len));
 			pde_store(pde, 0);
 			inc = NBPDR;
 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 			m->ref_count--;
 			if (m->ref_count == 0) {
 				*pdpe = 0;
 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			}
 			continue;
 		}
 		/*
 		 * 4KB mapping: clear the pte and drop one reference on the
 		 * page table page.  An empty page table page is released and
 		 * its pde cleared, which in turn drops a reference on the
 		 * page directory page.
 		 */
 		pte = pmap_pde_to_pte(pde, va);
 		KASSERT((*pte & X86_PG_V) != 0,
 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
 		    (u_long)pte, *pte));
 		pte_clear(pte);
 		inc = PAGE_SIZE;
 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
 		m->ref_count--;
 		if (m->ref_count == 0) {
 			*pde = 0;
 			SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 			m->ref_count--;
 			if (m->ref_count == 0) {
 				*pdpe = 0;
 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			}
 		}
 	}
 	pmap_invalidate_range(kernel_pmap, sva, sva + len);
 	PMAP_UNLOCK(kernel_pmap);
 	/* Free the collected pages only after the TLB invalidation. */
 	vm_page_free_pages_toq(&spgf, false);
 	vmem_free(large_vmem, sva, len);
 }
 
 /*
  * pmap_large_map_wb_fence() implementation that issues an MFENCE.
  */
 static void
 pmap_large_map_wb_fence_mfence(void)
 {
 
 	mfence();
 }
 
 /*
  * pmap_large_map_wb_fence() implementation that issues a sequentially
  * consistent thread fence.
  */
 static void
 pmap_large_map_wb_fence_atomic(void)
 {
 
 	atomic_thread_fence_seq_cst();
 }
 
 /*
  * pmap_large_map_wb_fence() implementation for the case where no fence
  * is required; intentionally empty.
  */
 static void
 pmap_large_map_wb_fence_nop(void)
 {
 }
 
 /*
  * Resolver for pmap_large_map_wb_fence(), the fence executed before and
  * after the cache line write-back loop in pmap_large_map_wb().
  *
  * - Non-Intel CPUs always use MFENCE.
  * - On Intel CPUs that provide CLWB or CLFLUSHOPT, the flush range
  *   resolver prefers those instructions.  They are weakly ordered with
  *   respect to other flushes and to stores to other cache lines, so an
  *   explicit fence is required around them.
  * - Otherwise only CLFLUSH (or no flush at all) is used.  CLFLUSH is
  *   ordered with respect to writes on Intel CPUs, so no fence is needed.
  *
  * The feature test was previously inverted: it selected the no-op fence
  * exactly when the weakly ordered instructions were in use.
  */
 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		return (pmap_large_map_wb_fence_mfence);
 	else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
 	    CPUID_STDEXT_CLFLUSHOPT)) != 0)
 		return (pmap_large_map_wb_fence_atomic);
 	else
 		/* clflush is strongly enough ordered */
 		return (pmap_large_map_wb_fence_nop);
 }
 
 /*
  * Write back every cache line that intersects [va, va + len) with CLWB.
  *
  * The loop is bounded by the end address rather than by counting 'len'
  * down in cache line steps: 'len' is unsigned, so a length that is not a
  * multiple of the line size would wrap around instead of reaching zero.
  * The start is rounded down to a line boundary so that an unaligned
  * range still covers its last partial line; this assumes
  * cpu_clflush_line_size is a power of two.
  */
 static void
 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clwb(va);
 }
 
 /*
  * Flush every cache line that intersects [va, va + len) with CLFLUSHOPT.
  *
  * The loop is bounded by the end address rather than by counting 'len'
  * down in cache line steps: 'len' is unsigned, so a length that is not a
  * multiple of the line size would wrap around instead of reaching zero.
  * The start is rounded down to a line boundary so that an unaligned
  * range still covers its last partial line; this assumes
  * cpu_clflush_line_size is a power of two.
  */
 static void
 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clflushopt(va);
 }
 
 /*
  * Flush every cache line that intersects [va, va + len) with CLFLUSH.
  *
  * The loop is bounded by the end address rather than by counting 'len'
  * down in cache line steps: 'len' is unsigned, so a length that is not a
  * multiple of the line size would wrap around instead of reaching zero.
  * The start is rounded down to a line boundary so that an unaligned
  * range still covers its last partial line; this assumes
  * cpu_clflush_line_size is a power of two.
  */
 static void
 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clflush(va);
 }
 
 /*
  * pmap_large_map_flush_range() implementation selected when none of
  * CLWB, CLFLUSHOPT or CLFLUSH is available; intentionally empty.
  */
 static void
 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
 {
 }
 
 /*
  * Resolver for pmap_large_map_flush_range(): prefer CLWB, then
  * CLFLUSHOPT, then CLFLUSH, and fall back to a no-op when the CPU
  * advertises none of them.
  */
 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
 		return (pmap_large_map_flush_range_clwb);
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
 		return (pmap_large_map_flush_range_clflushopt);
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		return (pmap_large_map_flush_range_clflush);
 	return (pmap_large_map_flush_range_nop);
 }
 
 /*
  * Write back the cache for the large map range [sva, eva).
  *
  * The range is walked one mapping at a time (1GB, 2MB or 4KB, as found
  * in the page tables).  A mapping is flushed only when its entry has
  * PG_M set, or when a concurrent write-back of the same entry was
  * observed.  X86_PG_AVAIL1 in the entry marks a write-back in progress;
  * it is set together with clearing PG_M by a compare-and-set and cleared
  * after the flush.
  */
 static void
 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
 {
 	volatile u_long *pe;
 	u_long p;
 	vm_offset_t va;
 	vm_size_t inc;
 	bool seen_other;
 
 	for (va = sva; va < eva; va += inc) {
 		/* Find the entry that maps va and the size it covers. */
 		inc = 0;
 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
 			pe = (volatile u_long *)pmap_large_map_pdpe(va);
 			p = *pe;
 			if ((p & X86_PG_PS) != 0)
 				inc = NBPDP;
 		}
 		if (inc == 0) {
 			pe = (volatile u_long *)pmap_large_map_pde(va);
 			p = *pe;
 			if ((p & X86_PG_PS) != 0)
 				inc = NBPDR;
 		}
 		if (inc == 0) {
 			pe = (volatile u_long *)pmap_large_map_pte(va);
 			p = *pe;
 			inc = PAGE_SIZE;
 		}
 		seen_other = false;
 		for (;;) {
 			if ((p & X86_PG_AVAIL1) != 0) {
 				/*
 				 * Spin-wait for the end of a parallel
 				 * write-back.
 				 */
 				cpu_spinwait();
 				p = *pe;
 
 				/*
 				 * If we saw other write-back
 				 * occurring, we cannot rely on PG_M to
 				 * indicate state of the cache.  The
 				 * PG_M bit is cleared before the
 				 * flush to avoid ignoring new writes,
 				 * and writes which are relevant for
 				 * us might happen after.
 				 */
 				seen_other = true;
 				continue;
 			}
 
 			if ((p & X86_PG_M) != 0 || seen_other) {
 				/* Claim the entry: clear PG_M, set AVAIL1. */
 				if (!atomic_fcmpset_long(pe, &p,
 				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
 					/*
 					 * If we saw PG_M without
 					 * PG_AVAIL1, and then on the
 					 * next attempt we do not
 					 * observe either PG_M or
 					 * PG_AVAIL1, the other
 					 * write-back started after us
 					 * and finished before us.  We
 					 * can rely on it doing our
 					 * work.
 					 */
 					continue;
 				pmap_large_map_flush_range(va, inc);
 				atomic_clear_long(pe, X86_PG_AVAIL1);
 			}
 			break;
 		}
 		maybe_yield();
 	}
 }
 
 /*
  * Write back the cache lines that cover the given address range.
  *
  * The range must be one returned by pmap_large_map(), or a sub-range of
  * it; coalesced ranges must not be passed.
  *
  * On CPUs that support none of the CLWB, CLFLUSHOPT and CLFLUSH
  * instructions this function does nothing.
  */
 void
 pmap_large_map_wb(void *svap, vm_size_t len)
 {
 	vm_offset_t start, end;
 
 	start = (vm_offset_t)svap;
 	end = start + len;
 
 	pmap_large_map_wb_fence();
 	if (start < DMAP_MIN_ADDRESS || end > DMAP_MIN_ADDRESS + dmaplimit) {
 		/* Not a direct map range, so it must be a large map range. */
 		KASSERT(start >= LARGEMAP_MIN_ADDRESS &&
 		    end <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
 		    ("pmap_large_map_wb: not largemap %#lx %#lx", start, len));
 		pmap_large_map_wb_large(start, end);
 	} else {
 		pmap_large_map_flush_range(start, len);
 	}
 	pmap_large_map_wb_fence();
 }
 
 /*
  * Grab the next page of pti_obj, wired and zeroed, for use by the PTI
  * page tables.  pti_obj must be write-locked; pti_pg_idx is advanced by
  * one on every call.
  */
 static vm_page_t
 pmap_pti_alloc_page(void)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	return (vm_page_grab(pti_obj, pti_pg_idx++,
 	    VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO));
 }
 
 /*
  * Drop one reference on PTI page 'm'.  The page is freed, and true is
  * returned, only when vm_page_unwire_noq() reports that the reference
  * dropped was the last one; otherwise false is returned.
  */
 static bool
 pmap_pti_free_page(vm_page_t m)
 {
 
 	KASSERT(m->ref_count > 0, ("page %p not referenced", m));
 	if (vm_page_unwire_noq(m)) {
 		vm_page_free_zero(m);
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Build the initial PTI page table.  Does nothing unless 'pti' is set.
  *
  * Allocates the backing object and the PTI pml4 page, creates the pdp
  * page for every kernel pml4 slot, and then enters the per-CPU data
  * area, the IDT, the four IST stacks of every CPU and the kernel text
  * into the PTI page table.  Only the kernel text is entered as
  * executable.  Sets pti_finalized when done.
  */
 static void
 pmap_pti_init(void)
 {
 	vm_page_t pml4_pg;
 	pdp_entry_t *pdpe;
 	vm_offset_t va;
 	int i;
 
 	if (!pti)
 		return;
 	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
 	VM_OBJECT_WLOCK(pti_obj);
 	pml4_pg = pmap_pti_alloc_page();
 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
 	/*
 	 * Walk the kernel address range in pml4-slot steps, creating each
 	 * pdp page and taking a reference on it.  The extra comparisons in
 	 * the loop condition stop the walk if 'va' wraps around.
 	 */
 	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
 	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
 		pdpe = pmap_pti_pdpe(va);
 		pmap_pti_wire_pte(pdpe);
 	}
 	/* Per-CPU data for all MAXCPU slots. */
 	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
 	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
 	/* Interrupt descriptor table. */
 	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
 	    sizeof(struct gate_descriptor) * NIDT, false);
 	CPU_FOREACH(i) {
 		/* Doublefault stack IST 1 */
 		va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
 		/* NMI stack IST 2 */
 		va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
 		/* MC# stack IST 3 */
 		va = __pcpu[i].pc_common_tss.tss_ist3 +
 		    sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
 		/* DB# stack IST 4 */
 		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
 	}
 	/* Kernel text, the only range entered as executable. */
 	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
 	    true);
 	pti_finalized = true;
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
 
 /*
  * Return a pointer to the PTI page directory pointer entry for 'va',
  * allocating the pdp page when the pml4 slot is still empty.  Allocating
  * a pml4 slot after pti_finalized has been set is a panic.  pti_obj must
  * be write-locked.
  */
 static pdp_entry_t *
 pmap_pti_pdpe(vm_offset_t va)
 {
 	pml4_entry_t *pml4e;
 	vm_page_t m;
 	vm_paddr_t pdp_pa;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pml4e = &pti_pml4[pmap_pml4e_index(va)];
 	if (*pml4e != 0) {
 		pdp_pa = *pml4e & ~PAGE_MASK;
 	} else {
 		if (pti_finalized)
 			panic("pml4 alloc after finalization\n");
 		m = pmap_pti_alloc_page();
 		/* Re-check the slot after the allocation. */
 		if (*pml4e == 0) {
 			pdp_pa = VM_PAGE_TO_PHYS(m);
 			*pml4e = pdp_pa | X86_PG_RW | X86_PG_V;
 		} else {
 			pmap_pti_free_page(m);
 			pdp_pa = *pml4e & ~PAGE_MASK;
 		}
 	}
 	return ((pdp_entry_t *)PHYS_TO_DMAP(pdp_pa) + pmap_pdpe_index(va));
 }
 
 /*
  * Add one reference to the PTI page table page that contains the entry
  * 'pte' points to.  pti_obj must be write-locked.
  */
 static void
 pmap_pti_wire_pte(void *pte)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->ref_count++;
 }
 
 /*
  * Drop one reference on the PTI page directory page that contains the
  * entry 'pde' points to.  Unless 'only_ref' is true, the page is asserted
  * to hold more than one reference, i.e. this call is not expected to
  * release the last one.  pti_obj must be write-locked.
  */
 static void
 pmap_pti_unwire_pde(void *pde, bool only_ref)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
 	MPASS(m->ref_count > 0);
 	MPASS(only_ref || m->ref_count > 1);
 	pmap_pti_free_page(m);
 }
 
 /*
  * Drop one reference on the PTI page table page that contains the entry
  * 'pte' points to.  When that frees the page, the pde mapping it for
  * 'va' is cleared and the page directory page loses one reference as
  * well.  pti_obj must be write-locked.
  */
 static void
 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
 {
 	vm_page_t ptp;
 	pd_entry_t *pde;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	ptp = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 	MPASS(ptp->ref_count > 0);
 	if (!pmap_pti_free_page(ptp))
 		return;
 
 	/* The page table page is gone; unlink it from its parent. */
 	pde = pmap_pti_pde(va);
 	MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
 	*pde = 0;
 	pmap_pti_unwire_pde(pde, false);
 }
 
 /*
  * Return a pointer to the PTI page directory entry for 'va', allocating
  * the page directory page when the pdp entry is still empty.  pti_obj
  * must be write-locked.
  */
 static pd_entry_t *
 pmap_pti_pde(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	vm_page_t m;
 	vm_paddr_t pd_pa;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pdpe = pmap_pti_pdpe(va);
 	if (*pdpe != 0) {
 		MPASS((*pdpe & X86_PG_PS) == 0);
 		pd_pa = *pdpe & ~PAGE_MASK;
 	} else {
 		m = pmap_pti_alloc_page();
 		/* Re-check the entry after the allocation. */
 		if (*pdpe == 0) {
 			pd_pa = VM_PAGE_TO_PHYS(m);
 			*pdpe = pd_pa | X86_PG_RW | X86_PG_V;
 		} else {
 			pmap_pti_free_page(m);
 			MPASS((*pdpe & X86_PG_PS) == 0);
 			pd_pa = *pdpe & ~PAGE_MASK;
 		}
 	}
 
 	return ((pd_entry_t *)PHYS_TO_DMAP(pd_pa) + pmap_pde_index(va));
 }
 
 /*
  * Return a pointer to the PTI page table entry for 'va', allocating the
  * page table page when the pde is still empty.  pti_obj must be
  * write-locked.
  *
  * When 'unwire_pde' is not NULL, an extra reference is taken on the page
  * directory page before the pde is examined.  '*unwire_pde' is set to
  * true when the caller must drop that reference again, and to false when
  * this call installed a new page table page, in which case the reference
  * is kept.
  */
 static pt_entry_t *
 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_page_t m;
 	vm_paddr_t mphys;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pde = pmap_pti_pde(va);
 	if (unwire_pde != NULL) {
 		*unwire_pde = true;
 		pmap_pti_wire_pte(pde);
 	}
 	if (*pde == 0) {
 		m = pmap_pti_alloc_page();
 		/* Re-check the entry after the allocation. */
 		if (*pde != 0) {
 			pmap_pti_free_page(m);
 			MPASS((*pde & X86_PG_PS) == 0);
 			mphys = *pde & ~(PAGE_MASK | pg_nx);
 		} else {
 			mphys = VM_PAGE_TO_PHYS(m);
 			*pde = mphys | X86_PG_RW | X86_PG_V;
 			/* The new pde keeps the reference taken above. */
 			if (unwire_pde != NULL)
 				*unwire_pde = false;
 		}
 	} else {
 		MPASS((*pde & X86_PG_PS) == 0);
 		mphys = *pde & ~(PAGE_MASK | pg_nx);
 	}
 
 	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
 	pte += pmap_pte_index(va);
 
 	return (pte);
 }
 
 /*
  * Enter the kernel virtual range [sva, eva) into the PTI page table,
  * page by page, using the physical pages the kernel pmap currently maps
  * there.  The range is expanded to page boundaries.  Mappings are made
  * non-executable unless 'exec' is true.  pti_obj must be write-locked.
  *
  * An already populated pte is tolerated only before pti_finalized is set
  * and only when it is identical to the value that would be written; both
  * conditions are asserted under INVARIANTS.
  */
 static void
 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
 {
 	vm_paddr_t pa;
 	pd_entry_t *pde;
 	pt_entry_t *pte, ptev;
 	bool unwire_pde;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	sva = trunc_page(sva);
 	MPASS(sva > VM_MAXUSER_ADDRESS);
 	eva = round_page(eva);
 	MPASS(sva < eva);
 	for (; sva < eva; sva += PAGE_SIZE) {
 		pte = pmap_pti_pte(sva, &unwire_pde);
 		pa = pmap_kextract(sva);
 		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
 		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
 		    VM_MEMATTR_DEFAULT, FALSE);
 		if (*pte == 0) {
 			/* New mapping: count it in the page table page. */
 			pte_store(pte, ptev);
 			pmap_pti_wire_pte(pte);
 		} else {
 			KASSERT(!pti_finalized,
 			    ("pti overlap after fin %#lx %#lx %#lx",
 			    sva, *pte, ptev));
 			KASSERT(*pte == ptev,
 			    ("pti non-identical pte after fin %#lx %#lx %#lx",
 			    sva, *pte, ptev));
 		}
 		/* Drop the temporary reference taken by pmap_pti_pte(). */
 		if (unwire_pde) {
 			pde = pmap_pti_pde(sva);
 			pmap_pti_unwire_pde(pde, true);
 		}
 	}
 }
 
 /*
  * Enter the kernel virtual range [sva, eva) into the PTI page table,
  * executable only when 'exec' is true.  Takes the pti_obj write lock
  * around pmap_pti_add_kva_locked(); does nothing when PTI is disabled.
  */
 void
 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
 {
 
 	if (pti) {
 		VM_OBJECT_WLOCK(pti_obj);
 		pmap_pti_add_kva_locked(sva, eva, exec);
 		VM_OBJECT_WUNLOCK(pti_obj);
 	}
 }
 
 /*
  * Remove the kernel virtual range [sva, eva) from the PTI page table.
  * The range is expanded to page boundaries.  Every page in it must
  * currently be mapped (asserted under INVARIANTS).  Each pte is cleared
  * and its page table page reference dropped, then the range is
  * invalidated in the kernel pmap.  Does nothing when PTI is disabled.
  */
 void
 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
 {
 	pt_entry_t *pte;
 	vm_offset_t va;
 
 	if (!pti)
 		return;
 	sva = rounddown2(sva, PAGE_SIZE);
 	MPASS(sva > VM_MAXUSER_ADDRESS);
 	eva = roundup2(eva, PAGE_SIZE);
 	MPASS(sva < eva);
 	VM_OBJECT_WLOCK(pti_obj);
 	for (va = sva; va < eva; va += PAGE_SIZE) {
 		/* NULL: no temporary page directory reference is wanted. */
 		pte = pmap_pti_pte(va, NULL);
 		KASSERT((*pte & X86_PG_V) != 0,
 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
 		    (u_long)pte, *pte));
 		pte_clear(pte);
 		pmap_pti_unwire_pte(pte, va);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, eva);
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
 
 /*
  * Duplicate the PKRU range node 'data' into a node freshly allocated
  * from pmap_pkru_ranges_zone.  The allocation does not sleep; NULL is
  * returned when it fails.
  */
 static void *
 pkru_dup_range(void *ctx __unused, void *data)
 {
 	struct pmap_pkru_range *copy;
 
 	copy = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 	if (copy != NULL)
 		memcpy(copy, data, sizeof(*copy));
 	return (copy);
 }
 
 /*
  * Return the PKRU range node 'node' to pmap_pkru_ranges_zone.
  */
 static void
 pkru_free_range(void *ctx __unused, void *node)
 {
 
 	uma_zfree(pmap_pkru_ranges_zone, node);
 }
 
 /*
  * Assign protection key 'keyidx' to the range [sva, eva) of 'pmap'.
  *
  * The pmap lock must be held, the pmap must be of type PT_X86 and the
  * CPU must support PKU (all asserted).  Returns EBUSY when
  * AMD64_PKRU_EXCL is requested and the range already has an assignment,
  * ENOMEM when no range node can be allocated, and otherwise the result
  * of rangeset_insert().  Only the AMD64_PKRU_PERSIST flag is recorded in
  * the node.
  */
 static int
 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
     int flags)
 {
 	struct pmap_pkru_range *range;
 	int rc;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 
 	if ((flags & AMD64_PKRU_EXCL) != 0) {
 		if (!rangeset_check_empty(&pmap->pm_pkru, sva, eva))
 			return (EBUSY);
 	}
 
 	range = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 	if (range == NULL)
 		return (ENOMEM);
 	range->pkru_keyidx = keyidx;
 	range->pkru_flags = flags & AMD64_PKRU_PERSIST;
 
 	rc = rangeset_insert(&pmap->pm_pkru, sva, eva, range);
 	if (rc != 0) {
 		/* The insert failed, so the node is still ours to free. */
 		uma_zfree(pmap_pkru_ranges_zone, range);
 	}
 	return (rc);
 }
 
 static int
 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
 }
 
 static void
 pmap_pkru_deassign_all(pmap_t pmap)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 		rangeset_remove_all(&pmap->pm_pkru);
 }
 
 static bool
 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct pmap_pkru_range *ppr, *prev_ppr;
 	vm_offset_t va;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type != PT_X86 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
 	    sva >= VM_MAXUSER_ADDRESS)
 		return (true);
 	MPASS(eva <= VM_MAXUSER_ADDRESS);
 	for (va = sva; va < eva; prev_ppr = ppr) {
 		ppr = rangeset_lookup(&pmap->pm_pkru, va);
 		if (va == sva)
 			prev_ppr = ppr;
 		else if ((ppr == NULL) ^ (prev_ppr == NULL))
 			return (false);
 		if (ppr == NULL) {
 			va += PAGE_SIZE;
 			continue;
 		}
 		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
 			return (false);
 		va = ppr->pkru_rs_el.re_end;
 	}
 	return (true);
 }
 
 static pt_entry_t
 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
 {
 	struct pmap_pkru_range *ppr;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type != PT_X86 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
 	    va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	ppr = rangeset_lookup(&pmap->pm_pkru, va);
 	if (ppr != NULL)
 		return (X86_PG_PKU(ppr->pkru_keyidx));
 	return (0);
 }
 
 static bool
 pred_pkru_on_remove(void *ctx __unused, void *r)
 {
 	struct pmap_pkru_range *ppr;
 
 	ppr = r;
 	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
 }
 
 static void
 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
 		    pred_pkru_on_remove);
 	}
 }
 
 static int
 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
 {
 
 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
 	MPASS(dst_pmap->pm_type == PT_X86);
 	MPASS(src_pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
 		return (0);
 	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
 }
 
 static void
 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     u_int keyidx)
 {
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t newpde, ptpaddr, *pde;
 	pt_entry_t newpte, *ptep, pte;
 	vm_offset_t va, va_next;
 	bool changed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
 
 	for (changed = false, va = sva; va < eva; va = va_next) {
 		pml4e = pmap_pml4e(pmap, va);
 		if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) {
 			va_next = (va + NBPML4) & ~PML4MASK;
 			if (va_next < va)
 				va_next = eva;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
 		if ((*pdpe & X86_PG_V) == 0) {
 			va_next = (va + NBPDP) & ~PDPMASK;
 			if (va_next < va)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (va + NBPDR) & ~PDRMASK;
 		if (va_next < va)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		ptpaddr = *pde;
 		if (ptpaddr == 0)
 			continue;
 
 		MPASS((ptpaddr & X86_PG_V) != 0);
 		if ((ptpaddr & PG_PS) != 0) {
 			if (va + NBPDR == va_next && eva >= va_next) {
 				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
 				    X86_PG_PKU(keyidx);
 				if (newpde != ptpaddr) {
 					*pde = newpde;
 					changed = true;
 				}
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, va)) {
 				continue;
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 
 		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
 		    ptep++, va += PAGE_SIZE) {
 			pte = *ptep;
 			if ((pte & X86_PG_V) == 0)
 				continue;
 			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
 			if (newpte != pte) {
 				*ptep = newpte;
 				changed = true;
 			}
 		}
 	}
 	if (changed)
 		pmap_invalidate_range(pmap, sva, eva);
 }
 
 static int
 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     u_int keyidx, int flags)
 {
 
 	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
 	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
 		return (EINVAL);
 	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
 		return (EFAULT);
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return (ENOTSUP);
 	return (0);
 }
 
 int
 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
     int flags)
 {
 	int error;
 
 	sva = trunc_page(sva);
 	eva = round_page(eva);
 	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
 	if (error != 0)
 		return (error);
 	for (;;) {
 		PMAP_LOCK(pmap);
 		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
 		if (error == 0)
 			pmap_pkru_update_range(pmap, sva, eva, keyidx);
 		PMAP_UNLOCK(pmap);
 		if (error != ENOMEM)
 			break;
 		vm_wait(NULL);
 	}
 	return (error);
 }
 
 int
 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	int error;
 
 	sva = trunc_page(sva);
 	eva = round_page(eva);
 	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
 	if (error != 0)
 		return (error);
 	for (;;) {
 		PMAP_LOCK(pmap);
 		error = pmap_pkru_deassign(pmap, sva, eva);
 		if (error == 0)
 			pmap_pkru_update_range(pmap, sva, eva, 0);
 		PMAP_UNLOCK(pmap);
 		if (error != ENOMEM)
 			break;
 		vm_wait(NULL);
 	}
 	return (error);
 }
 
 #ifdef KASAN
 static vm_page_t
 pmap_kasan_enter_alloc_4k(void)
 {
 	vm_page_t m;
 
 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO);
 	if (m == NULL)
 		panic("%s: no memory to grow shadow map", __func__);
 	return (m);
 }
 
 static vm_page_t
 pmap_kasan_enter_alloc_2m(void)
 {
-	vm_page_t m;
-
-	m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
-	    VM_ALLOC_WIRED, NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT);
-	if (m != NULL)
-		memset((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 0, NBPDR);
-	return (m);
+	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
+	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
 }
 
 /*
  * Grow the shadow map by at least one 4KB page at the specified address.  Use
  * 2MB pages when possible.
  */
 void
 pmap_kasan_enter(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_page_t m;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	pdpe = pmap_pdpe(kernel_pmap, va);
 	if ((*pdpe & X86_PG_V) == 0) {
 		m = pmap_kasan_enter_alloc_4k();
 		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 		    X86_PG_V | pg_nx);
 	}
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	if ((*pde & X86_PG_V) == 0) {
 		m = pmap_kasan_enter_alloc_2m();
 		if (m != NULL) {
 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
 		} else {
 			m = pmap_kasan_enter_alloc_4k();
 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 			    X86_PG_V | pg_nx);
 		}
 	}
 	if ((*pde & X86_PG_PS) != 0)
 		return;
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & X86_PG_V) != 0)
 		return;
 	m = pmap_kasan_enter_alloc_4k();
 	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
 	    X86_PG_M | X86_PG_A | pg_nx);
 }
 #endif
 
 #ifdef KMSAN
 static vm_page_t
 pmap_kmsan_enter_alloc_4k(void)
 {
 	vm_page_t m;
 
 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO);
 	if (m == NULL)
 		panic("%s: no memory to grow shadow map", __func__);
 	return (m);
 }
 
 static vm_page_t
 pmap_kmsan_enter_alloc_2m(void)
 {
-	vm_page_t m;
-
-	m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
-	    VM_ALLOC_WIRED, NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT);
-	if (m != NULL)
-		memset((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 0, NBPDR);
-	return (m);
+	return (vm_page_alloc_noobj_contig(VM_ALLOC_ZERO | VM_ALLOC_WIRED,
+	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
 }
 
 /*
  * Grow the shadow or origin maps by at least one 4KB page at the specified
  * address.  Use 2MB pages when possible.
  */
 void
 pmap_kmsan_enter(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_page_t m;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	pdpe = pmap_pdpe(kernel_pmap, va);
 	if ((*pdpe & X86_PG_V) == 0) {
 		m = pmap_kmsan_enter_alloc_4k();
 		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 		    X86_PG_V | pg_nx);
 	}
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	if ((*pde & X86_PG_V) == 0) {
 		m = pmap_kmsan_enter_alloc_2m();
 		if (m != NULL) {
 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
 		} else {
 			m = pmap_kmsan_enter_alloc_4k();
 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
 			    X86_PG_V | pg_nx);
 		}
 	}
 	if ((*pde & X86_PG_PS) != 0)
 		return;
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & X86_PG_V) != 0)
 		return;
 	m = pmap_kmsan_enter_alloc_4k();
 	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
 	    X86_PG_M | X86_PG_A | pg_nx);
 }
 #endif
 
 /*
  * Track a range of the kernel's virtual address space that is contiguous
  * in various mapping attributes.
  */
 struct pmap_kernel_map_range {
 	vm_offset_t sva;
 	pt_entry_t attrs;
 	int ptes;
 	int pdes;
 	int pdpes;
 };
 
 static void
 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
     vm_offset_t eva)
 {
 	const char *mode;
 	int i, pat_idx;
 
 	if (eva <= range->sva)
 		return;
 
 	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		if (pat_index[i] == pat_idx)
 			break;
 
 	switch (i) {
 	case PAT_WRITE_BACK:
 		mode = "WB";
 		break;
 	case PAT_WRITE_THROUGH:
 		mode = "WT";
 		break;
 	case PAT_UNCACHEABLE:
 		mode = "UC";
 		break;
 	case PAT_UNCACHED:
 		mode = "U-";
 		break;
 	case PAT_WRITE_PROTECTED:
 		mode = "WP";
 		break;
 	case PAT_WRITE_COMBINING:
 		mode = "WC";
 		break;
 	default:
 		printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
 		    __func__, pat_idx, range->sva, eva);
 		mode = "??";
 		break;
 	}
 
 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
 	    range->sva, eva,
 	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
 	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
 	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
 	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
 	    mode, range->pdpes, range->pdes, range->ptes);
 
 	/* Reset to sentinel value. */
 	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1);
 }
 
 /*
  * Determine whether the attributes specified by a page table entry match those
  * being tracked by the current range.  This is not quite as simple as a direct
  * flag comparison since some PAT modes have multiple representations.
  */
 static bool
 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
 {
 	pt_entry_t diff, mask;
 
 	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
 	diff = (range->attrs ^ attrs) & mask;
 	if (diff == 0)
 		return (true);
 	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
 	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
 	    pmap_pat_index(kernel_pmap, attrs, true))
 		return (true);
 	return (false);
 }
 
 static void
 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
     pt_entry_t attrs)
 {
 
 	memset(range, 0, sizeof(*range));
 	range->sva = va;
 	range->attrs = attrs;
 }
 
 /*
  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
  * those of the current run, dump the address range and its attributes, and
  * begin a new run.
  */
 static void
 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
     vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
     pt_entry_t pte)
 {
 	pt_entry_t attrs;
 
 	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
 
 	attrs |= pdpe & pg_nx;
 	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
 	if ((pdpe & PG_PS) != 0) {
 		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
 	} else if (pde != 0) {
 		attrs |= pde & pg_nx;
 		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
 	}
 	if ((pde & PG_PS) != 0) {
 		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
 	} else if (pte != 0) {
 		attrs |= pte & pg_nx;
 		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
 		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
 
 		/* Canonicalize by always using the PDE PAT bit. */
 		if ((attrs & X86_PG_PTE_PAT) != 0)
 			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
 	}
 
 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
 		sysctl_kmaps_dump(sb, range, va);
 		sysctl_kmaps_reinit(range, va, attrs);
 	}
 }
 
 static int
 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 {
 	struct pmap_kernel_map_range range;
 	struct sbuf sbuf, *sb;
 	pml4_entry_t pml4e;
 	pdp_entry_t *pdp, pdpe;
 	pd_entry_t *pd, pde;
 	pt_entry_t *pt, pte;
 	vm_offset_t sva;
 	vm_paddr_t pa;
 	int error, i, j, k, l;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = &sbuf;
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
 
 	/* Sentinel value. */
 	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1);
 
 	/*
 	 * Iterate over the kernel page tables without holding the kernel pmap
 	 * lock.  Outside of the large map, kernel page table pages are never
 	 * freed, so at worst we will observe inconsistencies in the output.
 	 * Within the large map, ensure that PDP and PD page addresses are
 	 * valid before descending.
 	 */
 	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
 		switch (i) {
 		case PML4PML4I:
 			sbuf_printf(sb, "\nRecursive map:\n");
 			break;
 		case DMPML4I:
 			sbuf_printf(sb, "\nDirect map:\n");
 			break;
 #ifdef KASAN
 		case KASANPML4I:
 			sbuf_printf(sb, "\nKASAN shadow map:\n");
 			break;
 #endif
 #ifdef KMSAN
 		case KMSANSHADPML4I:
 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
 			break;
 		case KMSANORIGPML4I:
 			sbuf_printf(sb, "\nKMSAN origin map:\n");
 			break;
 #endif
 		case KPML4BASE:
 			sbuf_printf(sb, "\nKernel map:\n");
 			break;
 		case LMSPML4I:
 			sbuf_printf(sb, "\nLarge map:\n");
 			break;
 		}
 
 		/* Convert to canonical form. */
 		if (sva == 1ul << 47)
 			sva |= -1ul << 48;
 
 restart:
 		pml4e = kernel_pml4[i];
 		if ((pml4e & X86_PG_V) == 0) {
 			sva = rounddown2(sva, NBPML4);
 			sysctl_kmaps_dump(sb, &range, sva);
 			sva += NBPML4;
 			continue;
 		}
 		pa = pml4e & PG_FRAME;
 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
 
 		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
 			pdpe = pdp[j];
 			if ((pdpe & X86_PG_V) == 0) {
 				sva = rounddown2(sva, NBPDP);
 				sysctl_kmaps_dump(sb, &range, sva);
 				sva += NBPDP;
 				continue;
 			}
 			pa = pdpe & PG_FRAME;
 			if ((pdpe & PG_PS) != 0) {
 				sva = rounddown2(sva, NBPDP);
 				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
 				    0, 0);
 				range.pdpes++;
 				sva += NBPDP;
 				continue;
 			}
 			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
 			    vm_phys_paddr_to_vm_page(pa) == NULL) {
 				/*
 				 * Page table pages for the large map may be
 				 * freed.  Validate the next-level address
 				 * before descending.
 				 */
 				goto restart;
 			}
 			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
 
 			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
 				pde = pd[k];
 				if ((pde & X86_PG_V) == 0) {
 					sva = rounddown2(sva, NBPDR);
 					sysctl_kmaps_dump(sb, &range, sva);
 					sva += NBPDR;
 					continue;
 				}
 				pa = pde & PG_FRAME;
 				if ((pde & PG_PS) != 0) {
 					sva = rounddown2(sva, NBPDR);
 					sysctl_kmaps_check(sb, &range, sva,
 					    pml4e, pdpe, pde, 0);
 					range.pdes++;
 					sva += NBPDR;
 					continue;
 				}
 				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
 				    vm_phys_paddr_to_vm_page(pa) == NULL) {
 					/*
 					 * Page table pages for the large map
 					 * may be freed.  Validate the
 					 * next-level address before descending.
 					 */
 					goto restart;
 				}
 				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
 
 				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
 				    sva += PAGE_SIZE) {
 					pte = pt[l];
 					if ((pte & X86_PG_V) == 0) {
 						sysctl_kmaps_dump(sb, &range,
 						    sva);
 						continue;
 					}
 					sysctl_kmaps_check(sb, &range, sva,
 					    pml4e, pdpe, pde, pte);
 					range.ptes++;
 				}
 			}
 		}
 	}
 
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (error);
 }
 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
     NULL, 0, sysctl_kmaps, "A",
     "Dump kernel address layout");
 
 #ifdef DDB
 DB_SHOW_COMMAND(pte, pmap_print_pte)
 {
 	pmap_t pmap;
 	pml5_entry_t *pml5;
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	vm_offset_t va;
 
 	if (!have_addr) {
 		db_printf("show pte addr\n");
 		return;
 	}
 	va = (vm_offset_t)addr;
 
 	if (kdb_thread != NULL)
 		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
 	else
 		pmap = PCPU_GET(curpmap);
 
 	PG_V = pmap_valid_bit(pmap);
 	db_printf("VA 0x%016lx", va);
 
 	if (pmap_is_la57(pmap)) {
 		pml5 = pmap_pml5e(pmap, va);
 		db_printf(" pml5e 0x%016lx", *pml5);
 		if ((*pml5 & PG_V) == 0) {
 			db_printf("\n");
 			return;
 		}
 		pml4 = pmap_pml5e_to_pml4e(pml5, va);
 	} else {
 		pml4 = pmap_pml4e(pmap, va);
 	}
 	db_printf(" pml4e 0x%016lx", *pml4);
 	if ((*pml4 & PG_V) == 0) {
 		db_printf("\n");
 		return;
 	}
 	pdp = pmap_pml4e_to_pdpe(pml4, va);
 	db_printf(" pdpe 0x%016lx", *pdp);
 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
 		db_printf("\n");
 		return;
 	}
 	pde = pmap_pdpe_to_pde(pdp, va);
 	db_printf(" pde 0x%016lx", *pde);
 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
 		db_printf("\n");
 		return;
 	}
 	pte = pmap_pde_to_pte(pde, va);
 	db_printf(" pte 0x%016lx\n", *pte);
 }
 
 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
 {
 	vm_paddr_t a;
 
 	if (have_addr) {
 		a = (vm_paddr_t)addr;
 		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
 	} else {
 		db_printf("show phys2dmap addr\n");
 	}
 }
 
 static void
 ptpages_show_page(int level, int idx, vm_page_t pg)
 {
 	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
 	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
 }
 
 static void
 ptpages_show_complain(int level, int idx, uint64_t pte)
 {
 	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
 }
 
 static void
 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
 {
 	vm_page_t pg3, pg2, pg1;
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pd;
 	int i4, i3, i2;
 
 	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
 	for (i4 = 0; i4 < num_entries; i4++) {
 		if ((pml4[i4] & PG_V) == 0)
 			continue;
 		pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
 		if (pg3 == NULL) {
 			ptpages_show_complain(3, i4, pml4[i4]);
 			continue;
 		}
 		ptpages_show_page(3, i4, pg3);
 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
 		for (i3 = 0; i3 < NPDPEPG; i3++) {
 			if ((pdp[i3] & PG_V) == 0)
 				continue;
 			pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
-			if (pg3 == NULL) {
+			if (pg2 == NULL) {
 				ptpages_show_complain(2, i3, pdp[i3]);
 				continue;
 			}
 			ptpages_show_page(2, i3, pg2);
 			pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
 			for (i2 = 0; i2 < NPDEPG; i2++) {
 				if ((pd[i2] & PG_V) == 0)
 					continue;
 				pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
 				if (pg1 == NULL) {
 					ptpages_show_complain(1, i2, pd[i2]);
 					continue;
 				}
 				ptpages_show_page(1, i2, pg1);
 			}
 		}
 	}
 }
 
 DB_SHOW_COMMAND(ptpages, pmap_ptpages)
 {
 	pmap_t pmap;
 	vm_page_t pg;
 	pml5_entry_t *pml5;
 	uint64_t PG_V;
 	int i5;
 
 	if (have_addr)
 		pmap = (pmap_t)addr;
 	else
 		pmap = PCPU_GET(curpmap);
 
 	PG_V = pmap_valid_bit(pmap);
 
 	if (pmap_is_la57(pmap)) {
 		pml5 = pmap->pm_pmltop;
 		for (i5 = 0; i5 < NUPML5E; i5++) {
 			if ((pml5[i5] & PG_V) == 0)
 				continue;
 			pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
 			if (pg == NULL) {
 				ptpages_show_complain(4, i5, pml5[i5]);
 				continue;
 			}
 			ptpages_show_page(4, i5, pg);
 			ptpages_show_pml4(pg, NPML4EPG, PG_V);
 		}
 	} else {
 		ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
 		    (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
 	}
 }
 #endif
diff --git a/sys/arm/nvidia/drm2/tegra_bo.c b/sys/arm/nvidia/drm2/tegra_bo.c
index 7479fd8bc8da..be5177973f4f 100644
--- a/sys/arm/nvidia/drm2/tegra_bo.c
+++ b/sys/arm/nvidia/drm2/tegra_bo.c
@@ -1,366 +1,362 @@
 /*-
  * Copyright (c) 2015 Michal Meloun
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 
 #include <machine/bus.h>
 
 #include <dev/extres/clk/clk.h>
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/drm_crtc_helper.h>
 #include <dev/drm2/drm_fb_helper.h>
 
 #include <arm/nvidia/drm2/tegra_drm.h>
 
 #include <sys/vmem.h>
 #include <sys/vmem.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 
 static void
 tegra_bo_destruct(struct tegra_bo *bo)
 {
 	vm_page_t m;
 	size_t size;
 	int i;
 
 	if (bo->cdev_pager == NULL)
 		return;
 
 	size = round_page(bo->gem_obj.size);
 	if (bo->vbase != 0)
 		pmap_qremove(bo->vbase, bo->npages);
 
 	VM_OBJECT_WLOCK(bo->cdev_pager);
 	for (i = 0; i < bo->npages; i++) {
 		m = bo->m[i];
 		vm_page_busy_acquire(m, 0);
 		cdev_pager_free_page(bo->cdev_pager, m);
 		m->flags &= ~PG_FICTITIOUS;
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(bo->cdev_pager);
 
 	vm_object_deallocate(bo->cdev_pager);
 	if (bo->vbase != 0)
 		vmem_free(kmem_arena, bo->vbase, size);
 }
 
 static void
 tegra_bo_free_object(struct drm_gem_object *gem_obj)
 {
 	struct tegra_bo *bo;
 
 	bo = container_of(gem_obj, struct tegra_bo, gem_obj);
 	drm_gem_free_mmap_offset(gem_obj);
 	drm_gem_object_release(gem_obj);
 
 	tegra_bo_destruct(bo);
 
 	free(bo->m, DRM_MEM_DRIVER);
 	free(bo, DRM_MEM_DRIVER);
 }
 
 static int
 tegra_bo_alloc_contig(size_t npages, u_long alignment, vm_memattr_t memattr,
     vm_page_t **ret_page)
 {
 	vm_page_t m;
-	int pflags, tries, i;
+	int tries, i;
 	vm_paddr_t low, high, boundary;
 
 	low = 0;
 	high = -1UL;
 	boundary = 0;
-	pflags = VM_ALLOC_NORMAL  | VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
-	    VM_ALLOC_WIRED | VM_ALLOC_ZERO;
 	tries = 0;
 retry:
-	m = vm_page_alloc_contig(NULL, 0, pflags, npages, low, high, alignment,
-	    boundary, memattr);
+	m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, npages,
+	    low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		if (tries < 3) {
-			if (!vm_page_reclaim_contig(pflags, npages, low, high,
+			if (!vm_page_reclaim_contig(0, npages, low, high,
 			    alignment, boundary))
 				vm_wait(NULL);
 			tries++;
 			goto retry;
 		}
 		return (ENOMEM);
 	}
 
 	for (i = 0; i < npages; i++, m++) {
-		if ((m->flags & PG_ZERO) == 0)
-			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
 		(*ret_page)[i] = m;
 	}
 
 	return (0);
 }
 
 /* Initialize pager and insert all object pages to it*/
 static int
 tegra_bo_init_pager(struct tegra_bo *bo)
 {
 	vm_page_t m;
 	size_t size;
 	int i;
 
 	size = round_page(bo->gem_obj.size);
 
 	bo->pbase = VM_PAGE_TO_PHYS(bo->m[0]);
 	if (vmem_alloc(kmem_arena, size, M_WAITOK | M_BESTFIT, &bo->vbase))
 		return (ENOMEM);
 
 	VM_OBJECT_WLOCK(bo->cdev_pager);
 	for (i = 0; i < bo->npages; i++) {
 		m = bo->m[i];
 		/*
 		 * XXX This is a temporary hack.
 		 * We need pager suitable for paging (mmap) managed
 		 * real (non-fictitious) pages.
 		 * - managed pages are needed for clean module unload.
 		 * - aliasing fictitious page to real one is bad,
 		 *   pmap cannot handle this situation without issues
 		 *   It expects that
 		 *    paddr = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(paddr))
 		 *   for every single page passed to pmap.
 		 */
 		m->oflags &= ~VPO_UNMANAGED;
 		m->flags |= PG_FICTITIOUS;
 		if (vm_page_insert(m, bo->cdev_pager, i) != 0)
 			return (EINVAL);
 	}
 	VM_OBJECT_WUNLOCK(bo->cdev_pager);
 
 	pmap_qenter(bo->vbase, bo->m, bo->npages);
 	return (0);
 }
 
 /* Allocate memory for frame buffer */
 static int
 tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
 {
 	size_t size;
 	int rv;
 
 	size = bo->gem_obj.size;
 
 	bo->npages = atop(size);
 	bo->m = malloc(sizeof(vm_page_t *) * bo->npages, DRM_MEM_DRIVER,
 	    M_WAITOK | M_ZERO);
 
 	rv = tegra_bo_alloc_contig(bo->npages, PAGE_SIZE,
 	    VM_MEMATTR_WRITE_COMBINING, &(bo->m));
 	if (rv != 0) {
 		DRM_WARNING("Cannot allocate memory for gem object.\n");
 		return (rv);
 	}
 	rv = tegra_bo_init_pager(bo);
 	if (rv != 0) {
 		DRM_WARNING("Cannot initialize gem object pager.\n");
 		return (rv);
 	}
 	return (0);
 }
 
 int
 tegra_bo_create(struct drm_device *drm, size_t size, struct tegra_bo **res_bo)
 {
 	struct tegra_bo *bo;
 	int rv;
 
 	if (size <= 0)
 		return (-EINVAL);
 
 	bo = malloc(sizeof(*bo), DRM_MEM_DRIVER, M_WAITOK | M_ZERO);
 
 	size = round_page(size);
 	rv = drm_gem_object_init(drm, &bo->gem_obj, size);
 	if (rv != 0) {
 		free(bo, DRM_MEM_DRIVER);
 		return (rv);
 	}
 	rv = drm_gem_create_mmap_offset(&bo->gem_obj);
 	if (rv != 0) {
 		drm_gem_object_release(&bo->gem_obj);
 		free(bo, DRM_MEM_DRIVER);
 		return (rv);
 	}
 
 	bo->cdev_pager = cdev_pager_allocate(&bo->gem_obj, OBJT_MGTDEVICE,
 	    drm->driver->gem_pager_ops, size, 0, 0, NULL);
 	rv = tegra_bo_alloc(drm, bo);
 	if (rv != 0) {
 		tegra_bo_free_object(&bo->gem_obj);
 		return (rv);
 	}
 
 	*res_bo = bo;
 	return (0);
 }
 
 static int
 tegra_bo_create_with_handle(struct drm_file *file, struct drm_device *drm,
     size_t size, uint32_t *handle, struct tegra_bo **res_bo)
 {
 	int rv;
 	struct tegra_bo *bo;
 
 	rv = tegra_bo_create(drm, size, &bo);
 	if (rv != 0)
 		return (rv);
 
 	rv = drm_gem_handle_create(file, &bo->gem_obj, handle);
 	if (rv != 0) {
 		tegra_bo_free_object(&bo->gem_obj);
-		drm_gem_object_release(&bo->gem_obj);
+		/* tegra_bo_free_object() already released and freed bo. */
 		return (rv);
 	}
 
 	drm_gem_object_unreference_unlocked(&bo->gem_obj);
 
 	*res_bo = bo;
 	return (0);
 }
 
 static int
 tegra_bo_dumb_create(struct drm_file *file, struct drm_device *drm_dev,
     struct drm_mode_create_dumb *args)
 {
 	struct tegra_drm *drm;
 	struct tegra_bo *bo;
 	int rv;
 
 	drm = container_of(drm_dev, struct tegra_drm, drm_dev);
 
 	args->pitch= (args->width * args->bpp + 7) / 8;
 	args->pitch = roundup(args->pitch, drm->pitch_align);
 	args->size = args->pitch * args->height;
 	rv = tegra_bo_create_with_handle(file, drm_dev, args->size,
 	    &args->handle, &bo);
 
 	return (rv);
 }
 
 static int
 tegra_bo_dumb_map_offset(struct drm_file *file_priv,
     struct drm_device *drm_dev, uint32_t handle, uint64_t *offset)
 {
 	struct drm_gem_object *gem_obj;
 	int rv;
 
 	DRM_LOCK(drm_dev);
 	gem_obj = drm_gem_object_lookup(drm_dev, file_priv, handle);
 	if (gem_obj == NULL) {
 		device_printf(drm_dev->dev, "Object not found\n");
 		DRM_UNLOCK(drm_dev);
 		return (-EINVAL);
 	}
 	rv = drm_gem_create_mmap_offset(gem_obj);
 	if (rv != 0)
 		goto fail;
 
 	*offset = DRM_GEM_MAPPING_OFF(gem_obj->map_list.key) |
 	    DRM_GEM_MAPPING_KEY;
 
 	drm_gem_object_unreference(gem_obj);
 	DRM_UNLOCK(drm_dev);
 	return (0);
 
 fail:
 	drm_gem_object_unreference(gem_obj);
 	DRM_UNLOCK(drm_dev);
 	return (rv);
 }
 
 static int
 tegra_bo_dumb_destroy(struct drm_file *file_priv, struct drm_device *drm_dev,
     unsigned int handle)
 {
 	int rv;
 
 	rv = drm_gem_handle_delete(file_priv, handle);
 	return (rv);
 }
 
 /*
  * mmap support
  */
 static int
 tegra_gem_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
 
 #ifdef DRM_PAGER_DEBUG
 	DRM_DEBUG("object %p offset %jd prot %d mres %p\n",
 	    vm_obj, (intmax_t)offset, prot, mres);
 #endif
 	return (VM_PAGER_FAIL);
 
 }
 
 static int
 tegra_gem_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	if (color != NULL)
 		*color = 0;
 	return (0);
 }
 
 static void
 tegra_gem_pager_dtor(void *handle)
 {
 
 }
 
 static struct cdev_pager_ops tegra_gem_pager_ops = {
 	.cdev_pg_fault = tegra_gem_pager_fault,
 	.cdev_pg_ctor  = tegra_gem_pager_ctor,
 	.cdev_pg_dtor  = tegra_gem_pager_dtor
 };
 
 /* Fill up relevant fields in drm_driver ops */
 void
 tegra_bo_driver_register(struct drm_driver *drm_drv)
 {
 	drm_drv->gem_free_object = tegra_bo_free_object;
 	drm_drv->gem_pager_ops = &tegra_gem_pager_ops;
 	drm_drv->dumb_create = tegra_bo_dumb_create;
 	drm_drv->dumb_map_offset = tegra_bo_dumb_map_offset;
 	drm_drv->dumb_destroy = tegra_bo_dumb_destroy;
 }
diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c
index f712c0c155a4..3c8bc2bd3c5b 100644
--- a/sys/compat/linuxkpi/common/src/linux_page.c
+++ b/sys/compat/linuxkpi/common/src/linux_page.c
@@ -1,370 +1,359 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2016 Matthew Macy (mmacy@mattmacy.io)
  * Copyright (c) 2017 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 
 #include <machine/bus.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/fs.h>
 #include <linux/shmem_fs.h>
 
 /*
  * Fill in the memory fields of 'si': total RAM is reported as 'physmem' in
  * units of PAGE_SIZE, and no high memory is reported.
  */
 void
 si_meminfo(struct sysinfo *si)
 {
 
 	si->mem_unit = PAGE_SIZE;
 	si->totalhigh = 0;
 	si->totalram = physmem;
 }
 
 /*
  * Return a kernel virtual address for 'page'.
  *
  * Pages owned by kernel_object are addressed through their page index
  * relative to VM_MIN_KERNEL_ADDRESS.  Any other page is addressed through
  * the direct map, or yields NULL when there is no direct map.
  */
 void *
 linux_page_address(struct page *page)
 {
 
 	if (page->object == kernel_object) {
 		return ((void *)(uintptr_t)(VM_MIN_KERNEL_ADDRESS +
 		    IDX_TO_OFF(page->pindex)));
 	}
 	if (PMAP_HAS_DMAP)
 		return ((void *)(uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page)));
 	return (NULL);
 }
 
 /*
  * Allocate 1 << order pages and return the first one, or NULL on failure.
  *
  * With a direct map (PMAP_HAS_DMAP) the pages are allocated wired, without
  * an object: a single page with no GFP_DMA32 restriction comes from
  * vm_page_alloc_noobj(); every other request is a contiguous allocation
  * bounded by 'pmax'.  If the contiguous allocation fails and M_WAITOK is
  * set, a reclaim (or a vm_wait()) is attempted, M_WAITOK is cleared and the
  * allocation is retried, so the retry happens at most once.
  *
  * Without a direct map the memory comes from linux_alloc_kmem() and the
  * page is looked up from the returned virtual address.
  */
 vm_page_t
 linux_alloc_pages(gfp_t flags, unsigned int order)
 {
 	vm_page_t page;
 
 	if (PMAP_HAS_DMAP) {
 		unsigned long npages = 1UL << order;
-		int req = VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_NORMAL;
+		int req = VM_ALLOC_WIRED;
 
 		if ((flags & M_ZERO) != 0)
 			req |= VM_ALLOC_ZERO;
 		if (order == 0 && (flags & GFP_DMA32) == 0) {
 			page = vm_page_alloc_noobj(req);
 			if (page == NULL)
 				return (NULL);
 		} else {
 			vm_paddr_t pmax = (flags & GFP_DMA32) ?
 			    BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR;
 		retry:
-			page = vm_page_alloc_contig(NULL, 0, req,
-			    npages, 0, pmax, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
-
+			page = vm_page_alloc_noobj_contig(req, npages, 0, pmax,
+			    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 			if (page == NULL) {
 				if (flags & M_WAITOK) {
 					if (!vm_page_reclaim_contig(req,
 					    npages, 0, pmax, PAGE_SIZE, 0)) {
 						vm_wait(NULL);
 					}
 					flags &= ~M_WAITOK;
 					goto retry;
 				}
 				return (NULL);
 			}
 		}
-		if (flags & M_ZERO) {
-			unsigned long x;
-
-			for (x = 0; x != npages; x++) {
-				vm_page_t pgo = page + x;
-
-				if ((pgo->flags & PG_ZERO) == 0)
-					pmap_zero_page(pgo);
-			}
-		}
 	} else {
 		vm_offset_t vaddr;
 
 		vaddr = linux_alloc_kmem(flags, order);
 		if (vaddr == 0)
 			return (NULL);
 
 		page = PHYS_TO_VM_PAGE(vtophys((void *)vaddr));
 
 		KASSERT(vaddr == (vm_offset_t)page_address(page),
 		    ("Page address mismatch"));
 	}
 
 	return (page);
 }
 
 /*
  * Release the 1 << order pages that start at 'page'.
  *
  * With a direct map each page is unwired and freed once
  * vm_page_unwire_noq() reports that it may be freed; otherwise the range is
  * returned through linux_free_kmem().
  */
 void
 linux_free_pages(vm_page_t page, unsigned int order)
 {
 	unsigned long remaining;
 	vm_page_t cur;
 
 	if (!PMAP_HAS_DMAP) {
 		linux_free_kmem((vm_offset_t)page_address(page), order);
 		return;
 	}
 
 	cur = page;
 	for (remaining = 1UL << order; remaining != 0; remaining--) {
 		if (vm_page_unwire_noq(cur))
 			vm_page_free(cur);
 		cur++;
 	}
 }
 
 /*
  * Allocate PAGE_SIZE << order bytes of kernel memory using the native part
  * of 'flags'.  GFP_DMA32 requests are served by a contiguous allocation
  * below BUS_SPACE_MAXADDR_32BIT; everything else comes from kmem_malloc().
  */
 vm_offset_t
 linux_alloc_kmem(gfp_t flags, unsigned int order)
 {
 	size_t nbytes = ((size_t)PAGE_SIZE) << order;
 
 	if ((flags & GFP_DMA32) != 0) {
 		return (kmem_alloc_contig(nbytes, flags & GFP_NATIVE_MASK, 0,
 		    BUS_SPACE_MAXADDR_32BIT, PAGE_SIZE, 0,
 		    VM_MEMATTR_DEFAULT));
 	}
 	return (kmem_malloc(nbytes, flags & GFP_NATIVE_MASK));
 }
 
 /* Free the PAGE_SIZE << order bytes of kernel memory that start at 'addr'. */
 void
 linux_free_kmem(vm_offset_t addr, unsigned int order)
 {
 
 	kmem_free(addr, ((size_t)PAGE_SIZE) << order);
 }
 
 /*
  * Fault in and hold 'nr_pages' pages of 'map' starting at 'start', asking
  * for write access as well when 'write' is non-zero.
  *
  * Returns 'nr_pages' on success and -EFAULT when
  * vm_fault_quick_hold_pages() reports -1.
  */
 static int
 linux_get_user_pages_internal(vm_map_t map, unsigned long start, int nr_pages,
     int write, struct page **pages)
 {
 	vm_prot_t access;
 	size_t span;
 	int held;
 
 	access = VM_PROT_READ;
 	if (write)
 		access |= VM_PROT_WRITE;
 	span = ptoa((vm_offset_t)nr_pages);
 	held = vm_fault_quick_hold_pages(map, start, span, access, pages,
 	    nr_pages);
 	if (held == -1)
 		return (-EFAULT);
 	return (nr_pages);
 }
 
 /*
  * Hold up to 'nr_pages' pages of the current process starting at 'start',
  * using pmap_extract_and_hold() for each page.
  *
  * Returns 0 when nothing is requested or when called in interrupt context,
  * -EINVAL when the range is not valid for the map, and otherwise the number
  * of leading pages that could be held (the walk stops at the first miss).
  */
 int
 __get_user_pages_fast(unsigned long start, int nr_pages, int write,
     struct page **pages)
 {
 	vm_map_t map;
 	vm_offset_t va, end;
 	vm_prot_t prot;
 	vm_page_t held;
 	int count;
 
 	if (nr_pages == 0 || in_interrupt())
 		return (0);
 
 	MPASS(pages != NULL);
 	map = &curthread->td_proc->p_vmspace->vm_map;
 	end = start + ptoa((vm_offset_t)nr_pages);
 	if (!vm_map_range_valid(map, start, end))
 		return (-EINVAL);
 
 	prot = VM_PROT_READ;
 	if (write)
 		prot |= VM_PROT_WRITE;
 
 	count = 0;
 	va = start;
 	while (va < end) {
 		held = pmap_extract_and_hold(map->pmap, va, prot);
 		pages[count] = held;
 		if (held == NULL)
 			break;
 
 		/*
 		 * Explicitly dirty the physical page.  Otherwise, the
 		 * caller's changes may go unnoticed because they are
 		 * performed through an unmanaged mapping or by a DMA
 		 * operation.
 		 *
 		 * The object lock is not held here.
 		 * See vm_page_clear_dirty_mask().
 		 */
 		if ((prot & VM_PROT_WRITE) != 0 &&
 		    held->dirty != VM_PAGE_BITS_ALL)
 			vm_page_dirty(held);
 
 		va += PAGE_SIZE;
 		count++;
 	}
 	return (count);
 }
 
 /*
  * Hold 'nr_pages' user pages of the process that owns 'task', starting at
  * 'start'.  Write access is requested when FOLL_WRITE is set in
  * 'gup_flags'; 'mm' and 'vmas' are not used.
  */
 long
 get_user_pages_remote(struct task_struct *task, struct mm_struct *mm,
     unsigned long start, unsigned long nr_pages, int gup_flags,
     struct page **pages, struct vm_area_struct **vmas)
 {
 	int want_write = (gup_flags & FOLL_WRITE) != 0;
 
 	return (linux_get_user_pages_internal(
 	    &task->task_thread->td_proc->p_vmspace->vm_map, start, nr_pages,
 	    want_write, pages));
 }
 
 /*
  * Hold 'nr_pages' user pages of the current process starting at 'start'.
  * Write access is requested when FOLL_WRITE is set in 'gup_flags'; 'vmas'
  * is not used.
  */
 long
 get_user_pages(unsigned long start, unsigned long nr_pages, int gup_flags,
     struct page **pages, struct vm_area_struct **vmas)
 {
 	int want_write = (gup_flags & FOLL_WRITE) != 0;
 
 	return (linux_get_user_pages_internal(
 	    &curthread->td_proc->p_vmspace->vm_map, start, nr_pages,
 	    want_write, pages));
 }
 
 int
 is_vmalloc_addr(const void *addr)
 {
 	return (vtoslab((vm_offset_t)addr & ~UMA_SLAB_MASK) != NULL);
 }
 
 /*
  * Insert the physical page identified by 'pfn' into the VM object backing
  * 'vma', at the page index that corresponds to 'addr', and apply the cache
  * mode derived from 'prot'.
  *
  * The caller must hold the write lock on vma->vm_obj (asserted below).  The
  * lock is dropped and re-taken while a page is detached from another
  * object.
  *
  * Returns VM_FAULT_NOPAGE on success and also when the page cannot be
  * renamed because it is still mapped; returns VM_FAULT_OOM when
  * vm_page_insert() fails.
  */
 vm_fault_t
 lkpi_vmf_insert_pfn_prot_locked(struct vm_area_struct *vma, unsigned long addr,
     unsigned long pfn, pgprot_t prot)
 {
 	vm_object_t vm_obj = vma->vm_obj;
 	vm_object_t tmp_obj;
 	vm_page_t page;
 	vm_pindex_t pindex;
 
 	VM_OBJECT_ASSERT_WLOCKED(vm_obj);
 	pindex = OFF_TO_IDX(addr - vma->vm_start);
 	/* First insertion for this vma: remember the starting index. */
 	if (vma->vm_pfn_count == 0)
 		vma->vm_pfn_first = pindex;
 	MPASS(pindex <= OFF_TO_IDX(vma->vm_end));
 
 retry:
 	/* Ask for the page at pindex without creating one (VM_ALLOC_NOCREAT). */
 	page = vm_page_grab(vm_obj, pindex, VM_ALLOC_NOCREAT);
 	if (page == NULL) {
 		page = PHYS_TO_VM_PAGE(IDX_TO_OFF(pfn));
 		if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL))
 			goto retry;
 		if (page->object != NULL) {
 			/*
 			 * The page still belongs to another object.  Swap
 			 * locks, remove it from that object if it is still
 			 * there and unmapped, then start over.
 			 */
 			tmp_obj = page->object;
 			vm_page_xunbusy(page);
 			VM_OBJECT_WUNLOCK(vm_obj);
 			VM_OBJECT_WLOCK(tmp_obj);
 			if (page->object == tmp_obj &&
 			    vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) {
 				KASSERT(page->object == tmp_obj,
 				    ("page has changed identity"));
 				KASSERT((page->oflags & VPO_UNMANAGED) == 0,
 				    ("page does not belong to shmem"));
 				vm_pager_page_unswapped(page);
 				if (pmap_page_is_mapped(page)) {
 					vm_page_xunbusy(page);
 					VM_OBJECT_WUNLOCK(tmp_obj);
 					printf("%s: page rename failed: page "
 					    "is mapped\n", __func__);
 					/* Return with vm_obj locked again, as on entry. */
 					VM_OBJECT_WLOCK(vm_obj);
 					return (VM_FAULT_NOPAGE);
 				}
 				vm_page_remove(page);
 			}
 			VM_OBJECT_WUNLOCK(tmp_obj);
 			VM_OBJECT_WLOCK(vm_obj);
 			goto retry;
 		}
 		if (vm_page_insert(page, vm_obj, pindex)) {
 			vm_page_xunbusy(page);
 			return (VM_FAULT_OOM);
 		}
 		vm_page_valid(page);
 	}
 	pmap_page_set_memattr(page, pgprot2cachemode(prot));
 	vma->vm_pfn_count++;
 
 	return (VM_FAULT_NOPAGE);
 }
 
 /*
  * Although the FreeBSD version of unmap_mapping_range has semantics and
  * parameter types compatible with the Linux version, the values passed in
  * are different:
  * @obj should match the vm_private_data field of the vm_area_struct returned
  *      by the mmap file operation handler, see linux_file_mmap_single()
  * @holelen should match the size of the area to be munmapped.
  *
  * @holebegin and @even_cows are unused: pages are looked up from index 0 up
  * to OFF_TO_IDX(holelen) in the object returned by cdev_pager_lookup(obj)
  * and released with cdev_pager_free_page().  Nothing is done when no such
  * object is found.
  */
 void
 lkpi_unmap_mapping_range(void *obj, loff_t const holebegin __unused,
     loff_t const holelen, int even_cows __unused)
 {
 	vm_object_t devobj;
 	vm_page_t page;
 	int i, page_count;
 
 	devobj = cdev_pager_lookup(obj);
 	if (devobj != NULL) {
 		page_count = OFF_TO_IDX(holelen);
 
 		VM_OBJECT_WLOCK(devobj);
 retry:
 		for (i = 0; i < page_count; i++) {
 			page = vm_page_lookup(devobj, i);
 			if (page == NULL)
 				continue;
 			/* Busy acquisition failed: rescan from the first index. */
 			if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL))
 				goto retry;
 			cdev_pager_free_page(devobj, page);
 		}
 		VM_OBJECT_WUNLOCK(devobj);
 		/* Presumably drops the reference taken by the lookup -- TODO confirm. */
 		vm_object_deallocate(devobj);
 	}
 }
diff --git a/sys/dev/drm2/ttm/ttm_bo.c b/sys/dev/drm2/ttm/ttm_bo.c
index 010afe6d8b3b..d5c11ecff25d 100644
--- a/sys/dev/drm2/ttm/ttm_bo.c
+++ b/sys/dev/drm2/ttm/ttm_bo.c
@@ -1,1895 +1,1894 @@
 /**************************************************************************
  *
  * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  **************************************************************************/
 /*
  * Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/ttm/ttm_module.h>
 #include <dev/drm2/ttm/ttm_bo_driver.h>
 #include <dev/drm2/ttm/ttm_placement.h>
 #include <vm/vm_pageout.h>
 
 /* Lock-assertion and debug hooks; both expand to nothing here. */
 #define TTM_ASSERT_LOCKED(param)
 #define TTM_DEBUG(fmt, arg...)
 #define TTM_BO_HASH_ORDER 13
 
 /* Forward declarations of file-local helpers. */
 static int ttm_bo_setup_vm(struct ttm_buffer_object *bo);
 static int ttm_bo_swapout(struct ttm_mem_shrink *shrink);
 static void ttm_bo_global_kobj_release(struct ttm_bo_global *glob);
 
 /* malloc(9) type used for buffer objects (see the free() in ttm_bo_release_list()). */
 MALLOC_DEFINE(M_TTM_BO, "ttm_bo", "TTM Buffer Objects");
 
 /*
  * Decode placement 'flags' into a memory type: the index of the lowest set
  * bit in the range 0..TTM_PL_PRIV5 is stored in '*mem_type'.
  *
  * Returns 0 when such a bit exists, -EINVAL otherwise ('*mem_type' is then
  * left untouched).
  */
 static inline int ttm_mem_type_from_flags(uint32_t flags, uint32_t *mem_type)
 {
 	int bit = 0;
 
 	while (bit <= TTM_PL_PRIV5) {
 		if ((flags & (1 << bit)) != 0) {
 			*mem_type = bit;
 			return 0;
 		}
 		bit++;
 	}
 	return -EINVAL;
 }
 
 /*
  * Print the state of the memory type manager 'mem_type' of 'bdev'.  For
  * every type other than TTM_PL_SYSTEM the manager's own debug hook is
  * invoked as well.
  */
 static void ttm_mem_type_debug(struct ttm_bo_device *bdev, int mem_type)
 {
 	struct ttm_mem_type_manager *mgr = &bdev->man[mem_type];
 
 	printf("    has_type: %d\n", mgr->has_type);
 	printf("    use_type: %d\n", mgr->use_type);
 	printf("    flags: 0x%08X\n", mgr->flags);
 	printf("    gpu_offset: 0x%08lX\n", mgr->gpu_offset);
 	printf("    size: %ju\n", (uintmax_t)mgr->size);
 	printf("    available_caching: 0x%08X\n", mgr->available_caching);
 	printf("    default_caching: 0x%08X\n", mgr->default_caching);
 
 	if (mem_type == TTM_PL_SYSTEM)
 		return;
 	mgr->func->debug(mgr, TTM_PFX);
 }
 
 /*
  * Report a failed placement: print the size of 'bo' and, for every entry of
  * 'placement', the raw placement flags, the memory type decoded from them
  * and the state of that memory type manager.  The report stops at the first
  * entry whose flags do not name a known memory type.
  */
 static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
 					struct ttm_placement *placement)
 {
 	/* Must match the uint32_t * out-parameter of ttm_mem_type_from_flags(). */
 	uint32_t mem_type;
 	int i, ret;
 
 	printf("No space for %p (%lu pages, %luK, %luM)\n",
 	       bo, bo->mem.num_pages, bo->mem.size >> 10,
 	       bo->mem.size >> 20);
 	for (i = 0; i < placement->num_placement; i++) {
 		ret = ttm_mem_type_from_flags(placement->placement[i],
 						&mem_type);
 		if (ret)
 			return;
 		printf("  placement[%d]=0x%08X (%d)\n",
 		       i, placement->placement[i], (int)mem_type);
 		ttm_mem_type_debug(bo->bdev, (int)mem_type);
 	}
 }
 
 /*
  * Compiled out (#if 0).  As written it would format glob->bo_count as an
  * unsigned long, followed by a newline, into 'buffer' (at most PAGE_SIZE
  * bytes) and return the snprintf() result.
  */
 #if 0
 static ssize_t ttm_bo_global_show(struct ttm_bo_global *glob,
     char *buffer)
 {
 
 	return snprintf(buffer, PAGE_SIZE, "%lu\n",
 			(unsigned long) atomic_read(&glob->bo_count));
 }
 #endif
 
 /*
  * Map a memory type index onto its single-bit mask (bit number 'type').
  * The shift is done on an unsigned 32-bit operand so that type 31 does not
  * overflow a signed int.
  */
 static inline uint32_t ttm_bo_type_flags(unsigned type)
 {
 	return (uint32_t)1 << type;
 }
 
 /*
  * Destroy 'bo' once nothing references it any more: the assertions require
  * both reference counts and the writer count to be zero and the bo to be
  * off every list.  The ttm is destroyed, the global bo count is decremented
  * and the bo is handed to its destroy hook, or freed when it has none.
  * Finally the accounted size is returned to the memory accounting.
  */
 static void ttm_bo_release_list(struct ttm_buffer_object *bo)
 {
 	/* Captured up front: 'bo' may be gone after the destroy step below. */
 	struct ttm_bo_device *owner = bo->bdev;
 	size_t accounted = bo->acc_size;
 
 	MPASS(atomic_read(&bo->list_kref) == 0);
 	MPASS(atomic_read(&bo->kref) == 0);
 	MPASS(atomic_read(&bo->cpu_writers) == 0);
 	MPASS(bo->sync_obj == NULL);
 	MPASS(bo->mem.mm_node == NULL);
 	MPASS(list_empty(&bo->lru));
 	MPASS(list_empty(&bo->ddestroy));
 
 	if (bo->ttm != NULL)
 		ttm_tt_destroy(bo->ttm);
 	atomic_dec(&bo->glob->bo_count);
 
 	if (bo->destroy == NULL)
 		free(bo, M_TTM_BO);
 	else
 		bo->destroy(bo);
 
 	ttm_mem_global_free(owner->glob->mem_glob, accounted);
 }
 
 /*
  * Sleep on 'bo' (with bo->glob->lru_lock as the interlock) until it is no
  * longer reserved.  The sleep catches signals when 'interruptible' is set.
  *
  * Returns 0 once the bo is unreserved, -ERESTARTSYS when the sleep ended
  * with EINTR or ERESTART, or the negated msleep() error otherwise.
  */
 static int
 ttm_bo_wait_unreserved_locked(struct ttm_buffer_object *bo, bool interruptible)
 {
 	const char *wmsg = interruptible ? "ttbowi" : "ttbowu";
 	int flags = interruptible ? PCATCH : 0;
 	int ret = 0;
 
 	while (ret == 0 && ttm_bo_is_reserved(bo)) {
 		ret = -msleep(bo, &bo->glob->lru_lock, flags, wmsg, 0);
 		if (ret == -EINTR || ret == -ERESTART)
 			ret = -ERESTARTSYS;
 	}
 	return (ret);
 }
 
 /*
  * Put a reserved 'bo' on the LRU list of its current memory type and, when
  * it has a ttm, on the global swap LRU as well.  Each list membership takes
  * one list_kref reference.  Buffers placed with TTM_PL_FLAG_NO_EVICT are
  * left off both lists.
  */
 void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
 {
 	struct ttm_mem_type_manager *man;
 
 	MPASS(ttm_bo_is_reserved(bo));
 
 	if ((bo->mem.placement & TTM_PL_FLAG_NO_EVICT) != 0)
 		return;
 
 	MPASS(list_empty(&bo->lru));
 
 	man = &bo->bdev->man[bo->mem.mem_type];
 	list_add_tail(&bo->lru, &man->lru);
 	refcount_acquire(&bo->list_kref);
 
 	if (bo->ttm == NULL)
 		return;
 	list_add_tail(&bo->swap, &bo->glob->swap_lru);
 	refcount_acquire(&bo->list_kref);
 }
 
 /*
  * Take 'bo' off the swap LRU and the memory-type LRU, whichever it is on.
  *
  * Returns the number of lists it was removed from (0, 1 or 2), i.e. the
  * number of list references the caller still has to drop.
  */
 int ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
 	int dropped = 0;
 
 	if (!list_empty(&bo->swap)) {
 		list_del_init(&bo->swap);
 		dropped++;
 	}
 	if (!list_empty(&bo->lru)) {
 		list_del_init(&bo->lru);
 		dropped++;
 	}
 
 	/*
 	 * TODO: Add a driver hook to delete from
 	 * driver-specific LRU's here.
 	 */
 
 	return dropped;
 }
 
 /*
  * Try to take the reservation flag (bo->reserved) of 'bo' without touching
  * the LRU lists.
  *
  * NOTE(review): the wait path sleeps with bo->glob->lru_lock as interlock
  * (see ttm_bo_wait_unreserved_locked()), so the caller presumably holds
  * that mutex -- confirm against callers.
  *
  * Returns 0 on success, -EDEADLK when 'sequence' equals the holder's
  * val_seq, -EAGAIN when the caller has to back off, -EBUSY when the bo is
  * reserved and 'no_wait' is set, or the error from the wait.
  */
 int ttm_bo_reserve_nolru(struct ttm_buffer_object *bo,
 			  bool interruptible,
 			  bool no_wait, bool use_sequence, uint32_t sequence)
 {
 	int ret;
 
 	/* atomic_xchg() returns the previous value: non-zero means already reserved. */
 	while (unlikely(atomic_xchg(&bo->reserved, 1) != 0)) {
 		/**
 		 * Deadlock avoidance for multi-bo reserving.
 		 */
 		if (use_sequence && bo->seq_valid) {
 			/**
 			 * We've already reserved this one.
 			 */
 			if (unlikely(sequence == bo->val_seq))
 				return -EDEADLK;
 			/**
 			 * Already reserved by a thread that will not back
 			 * off for us. We need to back off.
 			 */
 			if (unlikely(sequence - bo->val_seq < (1U << 31)))
 				return -EAGAIN;
 		}
 
 		if (no_wait)
 			return -EBUSY;
 
 		ret = ttm_bo_wait_unreserved_locked(bo, interruptible);
 
 		if (unlikely(ret))
 			return ret;
 	}
 
 	if (use_sequence) {
 		bool wake_up = false;
 		/**
 		 * Wake up waiters that may need to recheck for deadlock,
 		 * if we decreased the sequence number.
 		 */
 		if (unlikely((bo->val_seq - sequence < (1U << 31))
 			     || !bo->seq_valid))
 			wake_up = true;
 
 		/*
 		 * In the worst case with memory ordering these values can be
 		 * seen in the wrong order. However since we call wake_up_all
 		 * in that case, this will hopefully not pose a problem,
 		 * and the worst case would only cause someone to accidentally
 		 * hit -EAGAIN in ttm_bo_reserve when they see old value of
 		 * val_seq. However this would only happen if seq_valid was
 		 * written before val_seq was, and just means some slightly
 		 * increased cpu usage
 		 */
 		bo->val_seq = sequence;
 		bo->seq_valid = true;
 		if (wake_up)
 			wakeup(bo);
 	} else {
 		bo->seq_valid = false;
 	}
 
 	return 0;
 }
 
 /*
  * Drop 'count' list references from 'bo' in one atomic step.  When the
  * value seen before the subtraction was not larger than 'count', the bo is
  * released; with 'never_free' set that situation is a panic instead.
  */
 void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
 			 bool never_free)
 {
 	u_int before;
 
 	before = atomic_fetchadd_int(&bo->list_kref, -count);
 	if (before > count)
 		return;
 
 	if (never_free)
 		panic("ttm_bo_ref_buf");
 	ttm_bo_release_list(bo);
 }
 
 int ttm_bo_reserve(struct ttm_buffer_object *bo,
 		   bool interruptible,
 		   bool no_wait, bool use_sequence, uint32_t sequence)
 {
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count = 0;
 	int ret;
 
 	mtx_lock(&bo->glob->lru_lock);
 	ret = ttm_bo_reserve_nolru(bo, interruptible, no_wait, use_sequence,
 				   sequence);
 	if (likely(ret == 0)) {
 		put_count = ttm_bo_del_from_lru(bo);
 		mtx_unlock(&glob->lru_lock);
 		ttm_bo_list_ref_sub(bo, put_count, true);
 	} else
 		mtx_unlock(&bo->glob->lru_lock);
 
 	return ret;
 }
 
 /*
  * Reserve 'bo' for 'sequence', always waiting until the reservation can be
  * taken (there is no back-off and no no-wait mode here).  The LRU lists are
  * not touched.
  *
  * Meeting a holder with the same valid sequence number is only logged.
  * Returns 0 on success or the error from the wait.
  */
 int ttm_bo_reserve_slowpath_nolru(struct ttm_buffer_object *bo,
 				  bool interruptible, uint32_t sequence)
 {
 	bool wake_up = false;
 	int ret;
 
 	/* atomic_xchg() returns the previous value: non-zero means already reserved. */
 	while (unlikely(atomic_xchg(&bo->reserved, 1) != 0)) {
 		if (bo->seq_valid && sequence == bo->val_seq) {
 			DRM_ERROR(
 			    "%s: bo->seq_valid && sequence == bo->val_seq",
 			    __func__);
 		}
 
 		ret = ttm_bo_wait_unreserved_locked(bo, interruptible);
 
 		if (unlikely(ret))
 			return ret;
 	}
 
 	if ((bo->val_seq - sequence < (1U << 31)) || !bo->seq_valid)
 		wake_up = true;
 
 	/**
 	 * Wake up waiters that may need to recheck for deadlock,
 	 * if we decreased the sequence number.
 	 */
 	bo->val_seq = sequence;
 	bo->seq_valid = true;
 	if (wake_up)
 		wakeup(bo);
 
 	return 0;
 }
 
 int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
 			    bool interruptible, uint32_t sequence)
 {
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count, ret;
 
 	mtx_lock(&glob->lru_lock);
 	ret = ttm_bo_reserve_slowpath_nolru(bo, interruptible, sequence);
 	if (likely(!ret)) {
 		put_count = ttm_bo_del_from_lru(bo);
 		mtx_unlock(&glob->lru_lock);
 		ttm_bo_list_ref_sub(bo, put_count, true);
 	} else
 		mtx_unlock(&glob->lru_lock);
 	return ret;
 }
 
 /*
  * Release the reservation of 'bo': put it back on the LRU lists, clear
  * bo->reserved and wake the threads sleeping on the bo.
  *
  * NOTE(review): ttm_bo_unreserve() calls this with glob->lru_lock held;
  * other callers presumably must do the same -- confirm.
  */
 void ttm_bo_unreserve_locked(struct ttm_buffer_object *bo)
 {
 	/* Must precede clearing the flag: ttm_bo_add_to_lru() asserts the reservation. */
 	ttm_bo_add_to_lru(bo);
 	atomic_set(&bo->reserved, 0);
 	wakeup(bo);
 }
 
 /* Release the reservation of 'bo' while holding the global LRU lock. */
 void ttm_bo_unreserve(struct ttm_buffer_object *bo)
 {
 	struct mtx *lru_lock = &bo->glob->lru_lock;
 
 	mtx_lock(lru_lock);
 	ttm_bo_unreserve_locked(bo);
 	mtx_unlock(lru_lock);
 }
 
 /*
  * Create the ttm of 'bo' through the driver's ttm_tt_create hook.
  * Call bo->mutex locked.
  *
  * Device buffers additionally request zeroed pages when 'zero_alloc' is
  * set; sg buffers get TTM_PAGE_FLAG_SG and inherit bo->sg.
  *
  * Returns 0 on success, -ENOMEM when the hook yields no ttm and -EINVAL for
  * an unknown buffer type.  bo->ttm is NULL on every failure.
  */
 static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
 	uint32_t page_flags;
 
 	TTM_ASSERT_LOCKED(&bo->mutex);
 	bo->ttm = NULL;
 
 	page_flags = 0;
 	if (bdev->need_dma32)
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
 	switch (bo->type) {
 	case ttm_bo_type_device:
 	case ttm_bo_type_kernel:
 		/* Only device buffers honour the zero-fill request. */
 		if (bo->type == ttm_bo_type_device && zero_alloc)
 			page_flags |= TTM_PAGE_FLAG_ZERO_ALLOC;
 		bo->ttm = bdev->driver->ttm_tt_create(bdev,
 		    bo->num_pages << PAGE_SHIFT, page_flags,
 		    glob->dummy_read_page);
 		if (unlikely(bo->ttm == NULL))
 			return -ENOMEM;
 		return 0;
 	case ttm_bo_type_sg:
 		bo->ttm = bdev->driver->ttm_tt_create(bdev,
 		    bo->num_pages << PAGE_SHIFT,
 		    page_flags | TTM_PAGE_FLAG_SG, glob->dummy_read_page);
 		if (unlikely(bo->ttm == NULL))
 			return -ENOMEM;
 		bo->ttm->sg = bo->sg;
 		return 0;
 	default:
 		printf("[TTM] Illegal buffer object type\n");
 		return -EINVAL;
 	}
 }
 
 /*
  * Move 'bo' from its current placement (bo->mem) to the one described by
  * 'mem'.
  *
  * Steps, in order: tear down CPU mappings when either side is PCI memory or
  * the two placements share no caching flag; create, configure and bind a
  * ttm when the destination manager is not TTM_MEMTYPE_FLAG_FIXED; notify
  * the driver; then perform the move through ttm_bo_move_ttm(), the driver's
  * move hook, or ttm_bo_move_memcpy().
  *
  * Returns 0 on success.  On failure the error is returned and, if the bo is
  * then in a fixed memory type, its ttm is unbound and destroyed.
  */
 static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
 				  struct ttm_mem_reg *mem,
 				  bool evict, bool interruptible,
 				  bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	bool old_is_pci = ttm_mem_reg_is_pci(bdev, &bo->mem);
 	bool new_is_pci = ttm_mem_reg_is_pci(bdev, mem);
 	struct ttm_mem_type_manager *old_man = &bdev->man[bo->mem.mem_type];
 	struct ttm_mem_type_manager *new_man = &bdev->man[mem->mem_type];
 	int ret = 0;
 
 	if (old_is_pci || new_is_pci ||
 	    ((mem->placement & bo->mem.placement & TTM_PL_MASK_CACHING) == 0)) {
 		ret = ttm_mem_io_lock(old_man, true);
 		if (unlikely(ret != 0))
 			goto out_err;
 		ttm_bo_unmap_virtual_locked(bo);
 		ttm_mem_io_unlock(old_man);
 	}
 
 	/*
 	 * Create and bind a ttm if required.
 	 */
 
 	if (!(new_man->flags & TTM_MEMTYPE_FLAG_FIXED)) {
 		if (bo->ttm == NULL) {
 			/* Ask for zeroed pages unless the data comes from a fixed type. */
 			bool zero = !(old_man->flags & TTM_MEMTYPE_FLAG_FIXED);
 			ret = ttm_bo_add_ttm(bo, zero);
 			if (ret)
 				goto out_err;
 		}
 
 		ret = ttm_tt_set_placement_caching(bo->ttm, mem->placement);
 		if (ret)
 			goto out_err;
 
 		if (mem->mem_type != TTM_PL_SYSTEM) {
 			ret = ttm_tt_bind(bo->ttm, mem);
 			if (ret)
 				goto out_err;
 		}
 
 		/* Coming from system memory: adopting 'mem' completes the move. */
 		if (bo->mem.mem_type == TTM_PL_SYSTEM) {
 			if (bdev->driver->move_notify)
 				bdev->driver->move_notify(bo, mem);
 			bo->mem = *mem;
 			mem->mm_node = NULL;
 			goto moved;
 		}
 	}
 
 	if (bdev->driver->move_notify)
 		bdev->driver->move_notify(bo, mem);
 
 	if (!(old_man->flags & TTM_MEMTYPE_FLAG_FIXED) &&
 	    !(new_man->flags & TTM_MEMTYPE_FLAG_FIXED))
 		ret = ttm_bo_move_ttm(bo, evict, no_wait_gpu, mem);
 	else if (bdev->driver->move)
 		ret = bdev->driver->move(bo, evict, interruptible,
 					 no_wait_gpu, mem);
 	else
 		ret = ttm_bo_move_memcpy(bo, evict, no_wait_gpu, mem);
 
 	if (ret) {
 		/*
 		 * Notify the driver again with the two placements swapped,
 		 * then restore both to their values from before the swap.
 		 */
 		if (bdev->driver->move_notify) {
 			struct ttm_mem_reg tmp_mem = *mem;
 			*mem = bo->mem;
 			bo->mem = tmp_mem;
 			bdev->driver->move_notify(bo, mem);
 			bo->mem = *mem;
 			*mem = tmp_mem;
 		}
 
 		goto out_err;
 	}
 
 moved:
 	if (bo->evicted) {
 		/* A failure here is only logged; note that 'ret' is not what is returned. */
 		ret = bdev->driver->invalidate_caches(bdev, bo->mem.placement);
 		if (ret)
 			printf("[TTM] Can not flush read caches\n");
 		bo->evicted = false;
 	}
 
 	if (bo->mem.mm_node) {
 		bo->offset = (bo->mem.start << PAGE_SHIFT) +
 		    bdev->man[bo->mem.mem_type].gpu_offset;
 		bo->cur_placement = bo->mem.placement;
 	} else
 		bo->offset = 0;
 
 	return 0;
 
 out_err:
 	/* Re-read the manager: this is the memory type the bo is in now. */
 	new_man = &bdev->man[bo->mem.mem_type];
 	if ((new_man->flags & TTM_MEMTYPE_FLAG_FIXED) && bo->ttm) {
 		ttm_tt_unbind(bo->ttm);
 		ttm_tt_destroy(bo->ttm);
 		bo->ttm = NULL;
 	}
 
 	return ret;
 }
 
 /**
  * Call bo::reserved.
  * Will release GPU memory type usage on destruction.
  * This is the place to put in driver specific hooks to release
  * driver private resources.
  * Will release the bo::reserved lock.
  *
  * In order: the driver's move_notify hook is called with a NULL placement,
  * the ttm is unbound and destroyed, the memory node is returned, and the
  * reservation is cleared and its waiters are woken.
  */
 
 static void ttm_bo_cleanup_memtype_use(struct ttm_buffer_object *bo)
 {
 	if (bo->bdev->driver->move_notify)
 		bo->bdev->driver->move_notify(bo, NULL);
 
 	if (bo->ttm) {
 		ttm_tt_unbind(bo->ttm);
 		ttm_tt_destroy(bo->ttm);
 		bo->ttm = NULL;
 	}
 	ttm_bo_mem_put(bo, &bo->mem);
 
 	atomic_set(&bo->reserved, 0);
 	/*
 	 * Waiters sleep on the bo pointer itself (see
 	 * ttm_bo_wait_unreserved_locked()), so that is the channel to wake;
 	 * nobody sleeps on the address of this function's parameter.
 	 */
 	wakeup(bo);
 
 	/*
 	 * Since the final reference to this bo may not be dropped by
 	 * the current task we have to put a memory barrier here to make
 	 * sure the changes done in this function are always visible.
 	 *
 	 * This function only needs protection against the final kref_put.
 	 */
 	mb();
 }
 
 /*
  * Release path for a buffer object.  If the bo can be reserved without
  * waiting and has no sync object left after a non-waiting ttm_bo_wait(), it
  * is taken off the LRU lists and its memory is released right away.
  * Otherwise it is appended to bdev->ddestroy (holding an extra list_kref
  * reference), its sync object is flushed, and the delayed-destroy task is
  * scheduled hz / 100 ticks ahead (at least one tick).
  */
 static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
 	struct ttm_bo_driver *driver = bdev->driver;
 	void *sync_obj = NULL;
 	int put_count;
 	int ret;
 
 	mtx_lock(&glob->lru_lock);
 	/* no_wait is set: 'ret' is non-zero when someone else holds the reservation. */
 	ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
 
 	mtx_lock(&bdev->fence_lock);
 	(void) ttm_bo_wait(bo, false, false, true);
 	if (!ret && !bo->sync_obj) {
 		mtx_unlock(&bdev->fence_lock);
 		put_count = ttm_bo_del_from_lru(bo);
 
 		mtx_unlock(&glob->lru_lock);
 		ttm_bo_cleanup_memtype_use(bo);
 
 		ttm_bo_list_ref_sub(bo, put_count, true);
 
 		return;
 	}
 	if (bo->sync_obj)
 		sync_obj = driver->sync_obj_ref(bo->sync_obj);
 	mtx_unlock(&bdev->fence_lock);
 
 	/* Give the reservation back if it was taken above. */
 	if (!ret) {
 		atomic_set(&bo->reserved, 0);
 		wakeup(bo);
 	}
 
 	refcount_acquire(&bo->list_kref);
 	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
 	mtx_unlock(&glob->lru_lock);
 
 	if (sync_obj) {
 		driver->sync_obj_flush(sync_obj);
 		driver->sync_obj_unref(&sync_obj);
 	}
 	taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq,
 	    ((hz / 100) < 1) ? 1 : hz / 100);
 }
 
 /**
  * function ttm_bo_cleanup_refs_and_unlock
  * If bo idle, remove from delayed- and lru lists, and unref.
  * If not idle, do nothing.
  *
  * Must be called with lru_lock and reservation held, this function
  * will drop both before returning.
  *
  * @interruptible         Any sleeps should occur interruptibly.
  * @no_wait_gpu           Never wait for gpu. Return -EBUSY instead.
  *
  * Returns 0 when the bo was cleaned up, or when the reservation was lost to
  * another thread after waiting; otherwise the error from ttm_bo_wait() or
  * from the driver's sync_obj_wait hook.
  */
 
 static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 					  bool interruptible,
 					  bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_driver *driver = bdev->driver;
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count;
 	int ret;
 
 	mtx_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, false, true);
 
 	if (ret && !no_wait_gpu) {
 		void *sync_obj;
 
 		/*
 		 * Take a reference to the fence and unreserve,
 		 * at this point the buffer should be dead, so
 		 * no new sync objects can be attached.
 		 */
 		sync_obj = driver->sync_obj_ref(bo->sync_obj);
 		mtx_unlock(&bdev->fence_lock);
 
 		atomic_set(&bo->reserved, 0);
 		wakeup(bo);
 		mtx_unlock(&glob->lru_lock);
 
 		/* Both locks and the reservation are released while sleeping here. */
 		ret = driver->sync_obj_wait(sync_obj, false, interruptible);
 		driver->sync_obj_unref(&sync_obj);
 		if (ret)
 			return ret;
 
 		/*
 		 * remove sync_obj with ttm_bo_wait, the wait should be
 		 * finished, and no new wait object should have been added.
 		 */
 		mtx_lock(&bdev->fence_lock);
 		ret = ttm_bo_wait(bo, false, false, true);
 		mtx_unlock(&bdev->fence_lock);
 		if (ret)
 			return ret;
 
 		mtx_lock(&glob->lru_lock);
 		ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
 
 		/*
 		 * We raced, and lost, someone else holds the reservation now,
 		 * and is probably busy in ttm_bo_cleanup_memtype_use.
 		 *
 		 * Even if it's not the case, because we finished waiting any
 		 * delayed destruction would succeed, so just return success
 		 * here.
 		 */
 		if (ret) {
 			mtx_unlock(&glob->lru_lock);
 			return 0;
 		}
 	} else
 		mtx_unlock(&bdev->fence_lock);
 
 	/* Still busy, or no longer on the delayed-destroy list: just unreserve. */
 	if (ret || unlikely(list_empty(&bo->ddestroy))) {
 		atomic_set(&bo->reserved, 0);
 		wakeup(bo);
 		mtx_unlock(&glob->lru_lock);
 		return ret;
 	}
 
 	put_count = ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
 	/* One more reference to drop for the ddestroy list membership. */
 	++put_count;
 
 	mtx_unlock(&glob->lru_lock);
 	ttm_bo_cleanup_memtype_use(bo);
 
 	ttm_bo_list_ref_sub(bo, put_count, true);
 
 	return 0;
 }
 
 /**
  * Traverse the delayed list, and call ttm_bo_cleanup_refs on all
  * encountered buffers.
  *
  * With 'remove_all' set, a busy reservation is waited for and the cleanup
  * is allowed to wait for the GPU; otherwise neither wait happens.
  *
  * Each entry, and its successor, is pinned with a list_kref reference while
  * it is being worked on.  Returns 0 or the first non-zero result, which
  * ends the traversal.
  */
 
 static int ttm_bo_delayed_delete(struct ttm_bo_device *bdev, bool remove_all)
 {
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_buffer_object *entry = NULL;
 	int ret = 0;
 
 	mtx_lock(&glob->lru_lock);
 	if (list_empty(&bdev->ddestroy))
 		goto out_unlock;
 
 	entry = list_first_entry(&bdev->ddestroy,
 		struct ttm_buffer_object, ddestroy);
 	refcount_acquire(&entry->list_kref);
 
 	for (;;) {
 		struct ttm_buffer_object *nentry = NULL;
 
 		/* Pin the successor, if any, before 'entry' may leave the list. */
 		if (entry->ddestroy.next != &bdev->ddestroy) {
 			nentry = list_first_entry(&entry->ddestroy,
 				struct ttm_buffer_object, ddestroy);
 			refcount_acquire(&nentry->list_kref);
 		}
 
 		ret = ttm_bo_reserve_nolru(entry, false, true, false, 0);
 		if (remove_all && ret) {
 			ret = ttm_bo_reserve_nolru(entry, false, false,
 						   false, 0);
 		}
 
 		/* Either branch leaves lru_lock released. */
 		if (!ret)
 			ret = ttm_bo_cleanup_refs_and_unlock(entry, false,
 							     !remove_all);
 		else
 			mtx_unlock(&glob->lru_lock);
 
 		if (refcount_release(&entry->list_kref))
 			ttm_bo_release_list(entry);
 		entry = nentry;
 
 		if (ret || !entry)
 			goto out;
 
 		mtx_lock(&glob->lru_lock);
 		/* The successor left the delayed list in the meantime: stop here. */
 		if (list_empty(&entry->ddestroy))
 			break;
 	}
 
 out_unlock:
 	mtx_unlock(&glob->lru_lock);
 out:
 	if (entry && refcount_release(&entry->list_kref))
 		ttm_bo_release_list(entry);
 	return ret;
 }
 
 /*
  * Task handler for the delayed-destroy queue of the device passed in
  * 'arg'.  One pass over the list is made; when it reports a non-zero
  * result the task re-arms itself hz / 100 ticks ahead (at least one tick).
  */
 static void ttm_bo_delayed_workqueue(void *arg, int pending __unused)
 {
 	struct ttm_bo_device *bdev = arg;
 	int ticks_ahead;
 
 	if (ttm_bo_delayed_delete(bdev, false) == 0)
 		return;
 
 	ticks_ahead = ((hz / 100) < 1) ? 1 : hz / 100;
 	taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq, ticks_ahead);
 }
 
 /*
  * Tear down 'bo' after its last kref reference is gone (see
  * ttm_bo_unref()): remove it from the device address-space tree and give
  * back its vm_node, call ttm_mem_io_free_vm() under the manager's io lock,
  * run or queue the memory cleanup, and finally drop one list_kref
  * reference, releasing the bo if that was the last one.
  */
 static void ttm_bo_release(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type];
 
 	rw_wlock(&bdev->vm_lock);
 	if (likely(bo->vm_node != NULL)) {
 		RB_REMOVE(ttm_bo_device_buffer_objects,
 		    &bdev->addr_space_rb, bo);
 		drm_mm_put_block(bo->vm_node);
 		bo->vm_node = NULL;
 	}
 	rw_wunlock(&bdev->vm_lock);
 	/* NOTE(review): the return value of ttm_mem_io_lock() is not checked here. */
 	ttm_mem_io_lock(man, false);
 	ttm_mem_io_free_vm(bo);
 	ttm_mem_io_unlock(man);
 	ttm_bo_cleanup_refs_or_queue(bo);
 	if (refcount_release(&bo->list_kref))
 		ttm_bo_release_list(bo);
 }
 
 /*
  * Drop the caller's kref reference on '*p_bo' and clear the caller's
  * pointer.  The bo is released when that was the last reference.
  */
 void ttm_bo_unref(struct ttm_buffer_object **p_bo)
 {
 	struct ttm_buffer_object *victim;
 
 	victim = *p_bo;
 	*p_bo = NULL;
 
 	if (!refcount_release(&victim->kref))
 		return;
 	ttm_bo_release(victim);
 }
 
 /*
  * Stop the delayed-destroy task of 'bdev': cancel it and, when the cancel
  * call returns non-zero, drain it.
  *
  * Returns the 'pending' value reported by taskqueue_cancel_timeout().
  */
 int ttm_bo_lock_delayed_workqueue(struct ttm_bo_device *bdev)
 {
 	int was_pending;
 
 	if (taskqueue_cancel_timeout(taskqueue_thread, &bdev->wq,
 	    &was_pending) != 0)
 		taskqueue_drain_timeout(taskqueue_thread, &bdev->wq);
 	return (was_pending);
 }
 
 /*
  * Re-arm the delayed-destroy task of 'bdev' hz / 100 ticks ahead (at least
  * one tick) when 'resched' is non-zero; do nothing otherwise.
  */
 void ttm_bo_unlock_delayed_workqueue(struct ttm_bo_device *bdev, int resched)
 {
 	if (resched == 0)
 		return;
 
 	taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq,
 	    ((hz / 100) < 1) ? 1 : hz / 100);
 }
 
 /*
  * Evict a reserved 'bo': wait for its sync object, ask the driver for the
  * eviction placement, find space there and move the buffer.  On success
  * the bo is marked evicted.
  *
  * Returns 0 on success or the first error met; failures other than
  * -ERESTARTSYS are logged.
  */
 static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
 			bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_placement placement;
 	struct ttm_mem_reg evict_mem;
 	int ret;
 
 	mtx_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
 	mtx_unlock(&bdev->fence_lock);
 	if (unlikely(ret != 0)) {
 		if (ret != -ERESTARTSYS)
 			printf("[TTM] Failed to expire sync object before buffer eviction\n");
 		return ret;
 	}
 
 	MPASS(ttm_bo_is_reserved(bo));
 
 	/* Start from the current placement, without its node or io state. */
 	evict_mem = bo->mem;
 	evict_mem.mm_node = NULL;
 	evict_mem.bus.io_reserved_vm = false;
 	evict_mem.bus.io_reserved_count = 0;
 
 	placement.fpfn = 0;
 	placement.lpfn = 0;
 	placement.num_placement = 0;
 	placement.num_busy_placement = 0;
 	bdev->driver->evict_flags(bo, &placement);
 
 	ret = ttm_bo_mem_space(bo, &placement, &evict_mem, interruptible,
 				no_wait_gpu);
 	if (ret != 0) {
 		if (ret != -ERESTARTSYS) {
 			printf("[TTM] Failed to find memory space for buffer 0x%p eviction\n",
 			       bo);
 			ttm_bo_mem_space_debug(bo, &placement);
 		}
 		return ret;
 	}
 
 	ret = ttm_bo_handle_move_mem(bo, &evict_mem, true, interruptible,
 				     no_wait_gpu);
 	if (ret != 0) {
 		if (ret != -ERESTARTSYS)
 			printf("[TTM] Buffer eviction failed\n");
 		ttm_bo_mem_put(bo, &evict_mem);
 		return ret;
 	}
 
 	bo->evicted = true;
 	return ret;
 }
 
 /*
  * Evict one buffer object from the LRU list of memory type @mem_type.
  *
  * Walks man->lru under glob->lru_lock and takes the first object that
  * ttm_bo_reserve_nolru() manages to reserve.  An object that is already on
  * a ddestroy list is passed to ttm_bo_cleanup_refs_and_unlock(); any other
  * object is removed from the LRU and evicted with ttm_bo_evict().
  *
  * Returns 0 on success; -EBUSY if the LRU list is empty; the error of the
  * last reservation attempt if no object could be reserved; otherwise the
  * result of the cleanup or eviction step.
  */
 static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 				uint32_t mem_type,
 				bool interruptible,
 				bool no_wait_gpu)
 {
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo;
 	int ret = -EBUSY, put_count;
 
 	mtx_lock(&glob->lru_lock);
 	/* Stop at the first object that can be reserved. */
 	list_for_each_entry(bo, &man->lru, lru) {
 		ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
 		if (!ret)
 			break;
 	}
 
 	/* Nothing was reserved; bo must not be used past this point. */
 	if (ret) {
 		mtx_unlock(&glob->lru_lock);
 		return ret;
 	}
 
 	/* Extra list reference, dropped again before returning. */
 	refcount_acquire(&bo->list_kref);
 
 	if (!list_empty(&bo->ddestroy)) {
 		/*
 		 * lru_lock is not released on this path by this function.
 		 * NOTE(review): ttm_bo_cleanup_refs_and_unlock() presumably
 		 * drops it, as its name suggests -- confirm.
 		 */
 		ret = ttm_bo_cleanup_refs_and_unlock(bo, interruptible,
 						     no_wait_gpu);
 		if (refcount_release(&bo->list_kref))
 			ttm_bo_release_list(bo);
 		return ret;
 	}
 
 	put_count = ttm_bo_del_from_lru(bo);
 	mtx_unlock(&glob->lru_lock);
 
 	MPASS(ret == 0);
 
 	/* Drop the references counted by ttm_bo_del_from_lru(). */
 	ttm_bo_list_ref_sub(bo, put_count, true);
 
 	ret = ttm_bo_evict(bo, interruptible, no_wait_gpu);
 	ttm_bo_unreserve(bo);
 
 	if (refcount_release(&bo->list_kref))
 		ttm_bo_release_list(bo);
 	return ret;
 }
 
 /*
  * Return the node held by @mem, if any, to the manager of mem->mem_type
  * through that manager's put_node() hook.  A region without a node is left
  * untouched.
  */
 void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem)
 {
 	struct ttm_mem_type_manager *mgr;
 
 	mgr = &bo->bdev->man[mem->mem_type];
 	if (!mem->mm_node)
 		return;
 	(*mgr->func->put_node)(mgr, mem);
 }
 
 /**
  * Keep evicting buffers from the LRU of @mem_type until the manager's
  * get_node() hook hands out a node for @mem, or until either get_node() or
  * the eviction itself fails.
  *
  * Returns 0 with mem->mm_node and mem->mem_type set on success, otherwise
  * the error of the failing call.
  */
 static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 					uint32_t mem_type,
 					struct ttm_placement *placement,
 					struct ttm_mem_reg *mem,
 					bool interruptible,
 					bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	int ret;
 
 	for (;;) {
 		ret = (*man->func->get_node)(man, bo, placement, mem);
 		if (unlikely(ret != 0))
 			return ret;
 		if (mem->mm_node != NULL) {
 			/* Space found. */
 			mem->mem_type = mem_type;
 			return 0;
 		}
 
 		/* No room yet: push one more buffer out and try again. */
 		ret = ttm_mem_evict_first(bdev, mem_type, interruptible,
 		    no_wait_gpu);
 		if (unlikely(ret != 0))
 			return ret;
 	}
 }
 
 /*
  * Reduce the caching bits of @proposed_placement to one choice.
  *
  * All non-caching bits of @proposed_placement are passed through.  Among
  * the proposed caching bits, the first match in this order is kept: bits
  * shared with @cur_placement, the manager's default caching, then CACHED,
  * WC and UNCACHED.  If none matches, no caching bit is set.
  */
 static uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
 				      uint32_t cur_placement,
 				      uint32_t proposed_placement)
 {
 	uint32_t allowed = proposed_placement & TTM_PL_MASK_CACHING;
 	uint32_t chosen = 0;
 
 	if ((cur_placement & allowed) != 0) {
 		/* Keep the current caching when it is still acceptable. */
 		chosen = cur_placement & allowed;
 	} else if ((man->default_caching & allowed) != 0) {
 		chosen = man->default_caching;
 	} else if ((TTM_PL_FLAG_CACHED & allowed) != 0) {
 		chosen = TTM_PL_FLAG_CACHED;
 	} else if ((TTM_PL_FLAG_WC & allowed) != 0) {
 		chosen = TTM_PL_FLAG_WC;
 	} else if ((TTM_PL_FLAG_UNCACHED & allowed) != 0) {
 		chosen = TTM_PL_FLAG_UNCACHED;
 	}
 
 	return (proposed_placement & ~TTM_PL_MASK_CACHING) | chosen;
 }
 
 /*
  * Check whether @proposed_placement can be satisfied by memory type
  * @mem_type as managed by @man.
  *
  * The placement must name this memory type and share at least one caching
  * bit with man->available_caching.  On success *masked_placement receives
  * the type's flags combined with the shared caching bits and true is
  * returned; otherwise false is returned and *masked_placement is untouched.
  */
 static bool ttm_bo_mt_compatible(struct ttm_mem_type_manager *man,
 				 uint32_t mem_type,
 				 uint32_t proposed_placement,
 				 uint32_t *masked_placement)
 {
 	uint32_t type_flags = ttm_bo_type_flags(mem_type);
 	uint32_t caching = proposed_placement & man->available_caching;
 
 	if ((type_flags & proposed_placement & TTM_PL_MASK_MEM) == 0 ||
 	    caching == 0)
 		return false;
 
 	*masked_placement = type_flags | caching;
 	return true;
 }
 
 /**
  * Creates space for memory region @mem according to its type.
  *
  * This function first searches for free space in compatible memory types in
  * the priority order defined by the driver.  If free space isn't found, then
  * ttm_bo_mem_force_space is attempted in priority order to evict and find
  * space.
  *
  * @bo: buffer the space is for
  * @placement: preferred (placement[]) and fallback (busy_placement[]) lists
  * @mem: region to fill in; mm_node, mem_type and placement are written
  * @interruptible, @no_wait_gpu: passed on to ttm_bo_mem_force_space()
  *
  * Returns 0 on success.  Fails with the error of
  * ttm_mem_type_from_flags() or get_node() if either fails, -EINVAL if no
  * preferred placement named a usable non-system memory type, -ERESTARTSYS
  * if any forced-eviction attempt returned it, and -ENOMEM otherwise.
  */
 int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 			struct ttm_placement *placement,
 			struct ttm_mem_reg *mem,
 			bool interruptible,
 			bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man;
 	uint32_t mem_type = TTM_PL_SYSTEM;
 	uint32_t cur_flags = 0;
 	bool type_found = false;
 	bool type_ok = false;
 	bool has_erestartsys = false;
 	int i, ret;
 
 	mem->mm_node = NULL;
 	/* Pass 1: try the preferred placements without evicting anything. */
 	for (i = 0; i < placement->num_placement; ++i) {
 		ret = ttm_mem_type_from_flags(placement->placement[i],
 						&mem_type);
 		if (ret)
 			return ret;
 		man = &bdev->man[mem_type];
 
 		type_ok = ttm_bo_mt_compatible(man,
 						mem_type,
 						placement->placement[i],
 						&cur_flags);
 
 		if (!type_ok)
 			continue;
 
 		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
 						  cur_flags);
 		/*
 		 * Use the access and other non-mapping-related flag bits from
 		 * the memory placement flags to the current flags
 		 */
 		ttm_flag_masked(&cur_flags, placement->placement[i],
 				~TTM_PL_MASK_MEMTYPE);
 
 		/* System memory needs no node; accept it as is. */
 		if (mem_type == TTM_PL_SYSTEM)
 			break;
 
 		if (man->has_type && man->use_type) {
 			type_found = true;
 			ret = (*man->func->get_node)(man, bo, placement, mem);
 			if (unlikely(ret))
 				return ret;
 		}
 		if (mem->mm_node)
 			break;
 	}
 
 	/*
 	 * Success if the loop stopped on a compatible system placement or a
 	 * manager handed out a node.  type_ok and mem_type still hold the
 	 * values from the last loop iteration here.
 	 */
 	if ((type_ok && (mem_type == TTM_PL_SYSTEM)) || mem->mm_node) {
 		mem->mem_type = mem_type;
 		mem->placement = cur_flags;
 		return 0;
 	}
 
 	if (!type_found)
 		return -EINVAL;
 
 	/* Pass 2: walk the busy placements, evicting to make room. */
 	for (i = 0; i < placement->num_busy_placement; ++i) {
 		ret = ttm_mem_type_from_flags(placement->busy_placement[i],
 						&mem_type);
 		if (ret)
 			return ret;
 		man = &bdev->man[mem_type];
 		if (!man->has_type)
 			continue;
 		if (!ttm_bo_mt_compatible(man,
 						mem_type,
 						placement->busy_placement[i],
 						&cur_flags))
 			continue;
 
 		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
 						  cur_flags);
 		/*
 		 * Use the access and other non-mapping-related flag bits from
 		 * the memory placement flags to the current flags
 		 */
 		ttm_flag_masked(&cur_flags, placement->busy_placement[i],
 				~TTM_PL_MASK_MEMTYPE);
 
 
 		if (mem_type == TTM_PL_SYSTEM) {
 			mem->mem_type = mem_type;
 			mem->placement = cur_flags;
 			mem->mm_node = NULL;
 			return 0;
 		}
 
 		ret = ttm_bo_mem_force_space(bo, mem_type, placement, mem,
 						interruptible, no_wait_gpu);
 		if (ret == 0 && mem->mm_node) {
 			mem->placement = cur_flags;
 			return 0;
 		}
 		/* Remember an interrupted wait, but keep trying other types. */
 		if (ret == -ERESTARTSYS)
 			has_erestartsys = true;
 	}
 	ret = (has_erestartsys) ? -ERESTARTSYS : -ENOMEM;
 	return ret;
 }
 
 /*
  * Move a reserved buffer object to a region that satisfies @placement.
  *
  * Waits for the buffer to become idle, picks a destination with
  * ttm_bo_mem_space() and performs the move with ttm_bo_handle_move_mem().
  * If either step fails and a node was already obtained, the node is given
  * back.  Returns 0 on success or the error of the failing step.
  */
 static
 int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
 			struct ttm_placement *placement,
 			bool interruptible,
 			bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_reg mem;
 	int ret;
 
 	MPASS(ttm_bo_is_reserved(bo));
 
 	/*
 	 * FIXME: It's possible to pipeline buffer moves.
 	 * Have the driver move function wait for idle when necessary,
 	 * instead of doing it here.
 	 */
 	mtx_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
 	mtx_unlock(&bdev->fence_lock);
 	if (ret != 0)
 		return ret;
 
 	/* Describe the destination region; ttm_bo_mem_space() fills the rest. */
 	mem.num_pages = bo->num_pages;
 	mem.size = mem.num_pages << PAGE_SHIFT;
 	mem.page_alignment = bo->mem.page_alignment;
 	mem.bus.io_reserved_vm = false;
 	mem.bus.io_reserved_count = 0;
 
 	/* Determine where to move the buffer, then move it. */
 	ret = ttm_bo_mem_space(bo, placement, &mem, interruptible,
 	    no_wait_gpu);
 	if (ret == 0)
 		ret = ttm_bo_handle_move_mem(bo, &mem, false, interruptible,
 		    no_wait_gpu);
 
 	if (ret != 0 && mem.mm_node != NULL)
 		ttm_bo_mem_put(bo, &mem);
 	return ret;
 }
 
 /*
  * Find a placement entry that the region @mem already satisfies.
  *
  * A region holding a node must lie inside [fpfn, lpfn) whenever lpfn is
  * non-zero.  Beyond that, an entry matches when it shares at least one
  * caching bit and at least one memory-type bit with mem->placement.
  *
  * Returns the index of the first matching entry, or -1 if there is none.
  */
 static int ttm_bo_mem_compat(struct ttm_placement *placement,
 			     struct ttm_mem_reg *mem)
 {
 	int idx;
 
 	if (mem->mm_node && placement->lpfn != 0) {
 		if (mem->start < placement->fpfn)
 			return -1;
 		if (mem->start + mem->num_pages > placement->lpfn)
 			return -1;
 	}
 
 	for (idx = 0; idx < placement->num_placement; idx++) {
 		if (!(placement->placement[idx] & mem->placement &
 		    TTM_PL_MASK_CACHING))
 			continue;
 		if (!(placement->placement[idx] & mem->placement &
 		    TTM_PL_MASK_MEM))
 			continue;
 		return idx;
 	}
 	return -1;
 }
 
 /*
  * Make a reserved buffer object comply with @placement.
  *
  * If the buffer's current region already matches one of the placement
  * entries, only its non-memtype flag bits are refreshed from that entry;
  * otherwise the buffer is moved.  A buffer that ends up in system memory
  * without a ttm gets one added.
  *
  * Returns 0 on success, -EINVAL for an invalid page range, or the error of
  * the move or of ttm_bo_add_ttm().
  */
 int ttm_bo_validate(struct ttm_buffer_object *bo,
 			struct ttm_placement *placement,
 			bool interruptible,
 			bool no_wait_gpu)
 {
 	int match, ret;
 
 	MPASS(ttm_bo_is_reserved(bo));
 
 	/* A non-zero range must be ordered and big enough for the buffer. */
 	if (placement->lpfn || placement->fpfn) {
 		if (placement->fpfn > placement->lpfn)
 			return -EINVAL;
 		if ((placement->lpfn - placement->fpfn) < bo->num_pages)
 			return -EINVAL;
 	}
 
 	/* Check whether we need to move the buffer. */
 	match = ttm_bo_mem_compat(placement, &bo->mem);
 	if (match >= 0) {
 		/*
 		 * Use the access and other non-mapping-related flag bits from
 		 * the compatible memory placement flags to the active flags
 		 */
 		ttm_flag_masked(&bo->mem.placement,
 		    placement->placement[match], ~TTM_PL_MASK_MEMTYPE);
 	} else {
 		ret = ttm_bo_move_buffer(bo, placement, interruptible,
 		    no_wait_gpu);
 		if (ret != 0)
 			return ret;
 	}
 
 	/* We might need to add a TTM. */
 	if (bo->mem.mem_type == TTM_PL_SYSTEM && bo->ttm == NULL)
 		return ttm_bo_add_ttm(bo, true);
 	return 0;
 }
 
 /*
  * Assert that a non-zero placement range [fpfn, lpfn) is large enough to
  * hold all of the buffer's pages.
  *
  * The check is made with MPASS only; the function itself always returns 0.
  * NOTE(review): MPASS is presumably active only in debugging kernels, in
  * which case this performs no check at all otherwise -- confirm.
  */
 int ttm_bo_check_placement(struct ttm_buffer_object *bo,
 				struct ttm_placement *placement)
 {
 	MPASS(!((placement->fpfn || placement->lpfn) &&
 	    (bo->mem.num_pages > (placement->lpfn - placement->fpfn))));
 
 	return 0;
 }
 
 /*
  * Initialize a caller-allocated buffer object and validate it into
  * @placement.
  *
  * Accounts @acc_size with ttm_mem_global_alloc(), fills in the object's
  * fields (it starts out reserved, in system memory with cached placement),
  * allocates device address space for ttm_bo_type_device and
  * ttm_bo_type_sg objects, and finally calls ttm_bo_validate().  On
  * success the object is returned unreserved.
  *
  * On the two early failures (accounting failure, zero-page size) @bo is
  * disposed of through @destroy if one was given, otherwise with
  * free(bo, M_TTM_BO).  On later failures the object is unreserved and its
  * reference dropped with ttm_bo_unref().  Callers should therefore not
  * touch @bo after a failure.
  *
  * Returns 0 on success or a negative errno value.
  */
 int ttm_bo_init(struct ttm_bo_device *bdev,
 		struct ttm_buffer_object *bo,
 		unsigned long size,
 		enum ttm_bo_type type,
 		struct ttm_placement *placement,
 		uint32_t page_alignment,
 		bool interruptible,
 		struct vm_object *persistent_swap_storage,
 		size_t acc_size,
 		struct sg_table *sg,
 		void (*destroy) (struct ttm_buffer_object *))
 {
 	int ret = 0;
 	unsigned long num_pages;
 	struct ttm_mem_global *mem_glob = bdev->glob->mem_glob;
 
 	ret = ttm_mem_global_alloc(mem_glob, acc_size, false, false);
 	if (ret) {
 		printf("[TTM] Out of kernel memory\n");
 		if (destroy)
 			(*destroy)(bo);
 		else
 			free(bo, M_TTM_BO);
 		/* The specific error is replaced by -ENOMEM. */
 		return -ENOMEM;
 	}
 
 	/* Round the byte size up to whole pages. */
 	num_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (num_pages == 0) {
 		printf("[TTM] Illegal buffer object size\n");
 		if (destroy)
 			(*destroy)(bo);
 		else
 			free(bo, M_TTM_BO);
 		/* Undo the accounting done above. */
 		ttm_mem_global_free(mem_glob, acc_size);
 		return -EINVAL;
 	}
 	bo->destroy = destroy;
 
 	refcount_init(&bo->kref, 1);
 	refcount_init(&bo->list_kref, 1);
 	atomic_set(&bo->cpu_writers, 0);
 	/* The object starts out reserved; it is unreserved before returning. */
 	atomic_set(&bo->reserved, 1);
 	INIT_LIST_HEAD(&bo->lru);
 	INIT_LIST_HEAD(&bo->ddestroy);
 	INIT_LIST_HEAD(&bo->swap);
 	INIT_LIST_HEAD(&bo->io_reserve_lru);
 	bo->bdev = bdev;
 	bo->glob = bdev->glob;
 	bo->type = type;
 	bo->num_pages = num_pages;
 	bo->mem.size = num_pages << PAGE_SHIFT;
 	bo->mem.mem_type = TTM_PL_SYSTEM;
 	bo->mem.num_pages = bo->num_pages;
 	bo->mem.mm_node = NULL;
 	bo->mem.page_alignment = page_alignment;
 	bo->mem.bus.io_reserved_vm = false;
 	bo->mem.bus.io_reserved_count = 0;
 	bo->priv_flags = 0;
 	bo->mem.placement = (TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED);
 	bo->seq_valid = false;
 	bo->persistent_swap_storage = persistent_swap_storage;
 	bo->acc_size = acc_size;
 	bo->sg = sg;
 	atomic_inc(&bo->glob->bo_count);
 
 	ret = ttm_bo_check_placement(bo, placement);
 	if (unlikely(ret != 0))
 		goto out_err;
 
 	/*
 	 * For ttm_bo_type_device buffers, allocate
 	 * address space from the device.
 	 */
 	if (bo->type == ttm_bo_type_device ||
 	    bo->type == ttm_bo_type_sg) {
 		ret = ttm_bo_setup_vm(bo);
 		if (ret)
 			goto out_err;
 	}
 
 	/* Never passes no_wait_gpu: validation may wait for the GPU. */
 	ret = ttm_bo_validate(bo, placement, interruptible, false);
 	if (ret)
 		goto out_err;
 
 	ttm_bo_unreserve(bo);
 	return 0;
 
 out_err:
 	/* Drop the reservation and the initial reference taken above. */
 	ttm_bo_unreserve(bo);
 	ttm_bo_unref(&bo);
 
 	return ret;
 }
 
 /*
  * Compute the accounting size for a buffer object of @bo_size bytes whose
  * containing structure is @struct_size bytes.
  *
  * The result is the sum of the rounded structure size, a page-aligned
  * array of one pointer per buffer page, and the rounded size of a
  * struct ttm_tt.  @bdev is not used.
  */
 size_t ttm_bo_acc_size(struct ttm_bo_device *bdev,
 		       unsigned long bo_size,
 		       unsigned struct_size)
 {
 	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
 	size_t object_bytes = ttm_round_pot(struct_size);
 	size_t page_array_bytes = PAGE_ALIGN(npages * sizeof(void *));
 	size_t tt_bytes = ttm_round_pot(sizeof(struct ttm_tt));
 
 	return object_bytes + page_array_bytes + tt_bytes;
 }
 
 /*
  * Compute the accounting size for a DMA-backed buffer object of @bo_size
  * bytes whose containing structure is @struct_size bytes.
  *
  * The result is the sum of the rounded structure size, a page-aligned
  * array of one pointer per buffer page, a page-aligned array of one
  * dma_addr_t per buffer page, and the rounded size of a
  * struct ttm_dma_tt.  @bdev is not used.
  */
 size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
 			   unsigned long bo_size,
 			   unsigned struct_size)
 {
 	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
 	size_t object_bytes = ttm_round_pot(struct_size);
 	size_t page_array_bytes = PAGE_ALIGN(npages * sizeof(void *));
 	size_t dma_array_bytes = PAGE_ALIGN(npages * sizeof(dma_addr_t));
 	size_t tt_bytes = ttm_round_pot(sizeof(struct ttm_dma_tt));
 
 	return object_bytes + page_array_bytes + dma_array_bytes + tt_bytes;
 }
 
 /*
  * Allocate a zeroed buffer object and initialize it with ttm_bo_init().
  *
  * No sg table and no destroy callback are passed, so ttm_bo_init() disposes
  * of the object itself on its early failure paths.  *p_bo is written only
  * on success.
  *
  * Returns 0 on success or the error returned by ttm_bo_init().
  */
 int ttm_bo_create(struct ttm_bo_device *bdev,
 			unsigned long size,
 			enum ttm_bo_type type,
 			struct ttm_placement *placement,
 			uint32_t page_alignment,
 			bool interruptible,
 			struct vm_object *persistent_swap_storage,
 			struct ttm_buffer_object **p_bo)
 {
 	struct ttm_buffer_object *new_bo;
 	size_t acc_size;
 	int ret;
 
 	new_bo = malloc(sizeof(*new_bo), M_TTM_BO, M_WAITOK | M_ZERO);
 	acc_size = ttm_bo_acc_size(bdev, size,
 	    sizeof(struct ttm_buffer_object));
 
 	ret = ttm_bo_init(bdev, new_bo, size, type, placement,
 	    page_alignment, interruptible, persistent_swap_storage,
 	    acc_size, NULL, NULL);
 	if (unlikely(ret != 0))
 		return ret;
 
 	*p_bo = new_bo;
 	return 0;
 }
 
 /*
  * Evict buffers of @mem_type one at a time until the type's LRU list is
  * empty.
  *
  * The list is re-examined under glob->lru_lock before every eviction
  * because the lock is not held across ttm_mem_evict_first().  With
  * @allow_errors set, the first eviction failure is returned to the caller;
  * otherwise failures are only logged and the loop keeps going.
  *
  * Returns 0 once the list is empty.
  */
 static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 					unsigned mem_type, bool allow_errors)
 {
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_bo_global *glob = bdev->glob;
 	bool lru_empty;
 	int ret;
 
 	for (;;) {
 		mtx_lock(&glob->lru_lock);
 		lru_empty = list_empty(&man->lru);
 		mtx_unlock(&glob->lru_lock);
 		if (lru_empty)
 			return 0;
 
 		ret = ttm_mem_evict_first(bdev, mem_type, false, false);
 		if (ret == 0)
 			continue;
 		if (allow_errors)
 			return ret;
 		printf("[TTM] Cleanup eviction failed\n");
 	}
 }
 
 /*
  * Take down the memory manager for @mem_type.
  *
  * The manager is marked unused and uninitialized.  For every type other
  * than type 0, all buffers on the type's LRU are evicted (failures are
  * only logged) and the manager's takedown() hook is called.
  *
  * Returns -EINVAL if @mem_type is out of range or the manager was never
  * initialized, 0 for type 0, and the result of takedown() otherwise.
  */
 int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type)
 {
 	struct ttm_mem_type_manager *man;
 	int ret = -EINVAL;
 
 	if (mem_type >= TTM_NUM_MEM_TYPES) {
 		/* mem_type is unsigned, so it is printed with %u. */
 		printf("[TTM] Illegal memory type %u\n", mem_type);
 		return ret;
 	}
 	man = &bdev->man[mem_type];
 
 	if (!man->has_type) {
 		printf("[TTM] Trying to take down uninitialized memory manager type %u\n",
 		       mem_type);
 		return ret;
 	}
 
 	man->use_type = false;
 	man->has_type = false;
 
 	ret = 0;
 	if (mem_type > 0) {
 		/* With allow_errors false this only returns once the LRU is empty. */
 		ttm_bo_force_list_clean(bdev, mem_type, false);
 
 		ret = (*man->func->takedown)(man);
 	}
 
 	return ret;
 }
 
 /*
  * Evict every buffer from memory type @mem_type.
  *
  * Returns -EINVAL if @mem_type is 0 or out of range, 0 if the type has not
  * been initialized (a message is logged), and otherwise the result of
  * ttm_bo_force_list_clean(), which stops at the first eviction error.
  */
 int ttm_bo_evict_mm(struct ttm_bo_device *bdev, unsigned mem_type)
 {
 	struct ttm_mem_type_manager *man;
 
 	if (mem_type == 0 || mem_type >= TTM_NUM_MEM_TYPES) {
 		printf("[TTM] Illegal memory manager memory type %u\n", mem_type);
 		return -EINVAL;
 	}
 
 	/* Index bdev->man[] only after mem_type has been range-checked. */
 	man = &bdev->man[mem_type];
 	if (!man->has_type) {
 		printf("[TTM] Memory type %u has not been initialized\n", mem_type);
 		return 0;
 	}
 
 	return ttm_bo_force_list_clean(bdev, mem_type, true);
 }
 
 /*
  * Initialize the memory manager for memory type @type with size @p_size.
  *
  * Sets up the manager's I/O reservation state, lets the driver describe
  * the type through init_mem_type(), and, for every type except
  * TTM_PL_SYSTEM, calls the manager's init() hook.  The manager is marked
  * present and usable only after all of that succeeded.
  *
  * Returns 0 on success or the error of the failing hook.
  */
 int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
 			unsigned long p_size)
 {
 	struct ttm_mem_type_manager *man;
 	int ret;
 
 	MPASS(type < TTM_NUM_MEM_TYPES);
 	man = &bdev->man[type];
 	MPASS(!man->has_type);
 
 	man->io_reserve_fastpath = true;
 	man->use_io_reserve_lru = false;
 	sx_init(&man->io_reserve_mutex, "ttmman");
 	INIT_LIST_HEAD(&man->io_reserve_lru);
 
 	ret = bdev->driver->init_mem_type(bdev, type, man);
 	if (ret != 0)
 		return ret;
 	man->bdev = bdev;
 
 	/* System memory has no range manager to initialize. */
 	if (type != TTM_PL_SYSTEM) {
 		ret = (*man->func->init)(man, p_size);
 		if (ret != 0)
 			return ret;
 	}
 
 	man->has_type = true;
 	man->use_type = true;
 	man->size = p_size;
 
 	INIT_LIST_HEAD(&man->lru);
 
 	return 0;
 }
 
 /*
  * Undo the global setup done by ttm_bo_global_init(): unregister the
  * swapout shrinker and free the dummy read page.  The glob structure
  * itself is not freed here.
  */
 static void ttm_bo_global_kobj_release(struct ttm_bo_global *glob)
 {
 
 	ttm_mem_unregister_shrink(glob->mem_glob, &glob->shrink);
 	vm_page_free(glob->dummy_read_page);
 }
 
 /*
  * Drop one reference on the global state stored in @ref->object and tear
  * it down when refcount_release() reports that it was the last one.
  */
 void ttm_bo_global_release(struct drm_global_reference *ref)
 {
 	struct ttm_bo_global *global = ref->object;
 
 	if (!refcount_release(&global->kobj_ref))
 		return;
 	ttm_bo_global_kobj_release(global);
 }
 
 /*
  * Initialize the global buffer-object state stored in @ref->object.
  *
  * Sets up the global locks, allocates one uncacheable dummy read page
  * (with a single reclaim-and-retry attempt if the allocation fails),
  * initializes the global lists and registers ttm_bo_swapout() as the
  * memory shrinker.
  *
  * Returns 0 on success.  On failure returns -ENOMEM (no dummy page) or the
  * error of ttm_mem_register_shrink(); in both cases glob itself is freed
  * with free(glob, M_DRM_GLOBAL) before returning.
  */
 int ttm_bo_global_init(struct drm_global_reference *ref)
 {
 	struct ttm_bo_global_ref *bo_ref =
 		container_of(ref, struct ttm_bo_global_ref, ref);
 	struct ttm_bo_global *glob = ref->object;
-	int req, ret;
+	int ret;
 	int tries;
 
 	sx_init(&glob->device_list_mutex, "ttmdlm");
 	mtx_init(&glob->lru_lock, "ttmlru", NULL, MTX_DEF);
 	glob->mem_glob = bo_ref->mem_glob;
-	req = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ;
 	tries = 0;
 retry:
-	glob->dummy_read_page = vm_page_alloc_contig(NULL, 0, req,
-	    1, 0, VM_MAX_ADDRESS, PAGE_SIZE, 0, VM_MEMATTR_UNCACHEABLE);
+	glob->dummy_read_page = vm_page_alloc_noobj_contig(0, 1, 0,
+	    VM_MAX_ADDRESS, PAGE_SIZE, 0, VM_MEMATTR_UNCACHEABLE);
 
 	if (unlikely(glob->dummy_read_page == NULL)) {
-		if (tries < 1 && vm_page_reclaim_contig(req, 1,
-		    0, VM_MAX_ADDRESS, PAGE_SIZE, 0)) {
+		if (tries < 1 && vm_page_reclaim_contig(0, 1, 0,
+		    VM_MAX_ADDRESS, PAGE_SIZE, 0)) {
 			tries++;
 			goto retry;
 		}
 		ret = -ENOMEM;
 		goto out_no_drp;
 	}
 
 	INIT_LIST_HEAD(&glob->swap_lru);
 	INIT_LIST_HEAD(&glob->device_list);
 
 	ttm_mem_init_shrink(&glob->shrink, ttm_bo_swapout);
 	ret = ttm_mem_register_shrink(glob->mem_glob, &glob->shrink);
 	if (unlikely(ret != 0)) {
 		printf("[TTM] Could not register buffer object swapout\n");
 		goto out_no_shrink;
 	}
 
 	atomic_set(&glob->bo_count, 0);
 
 	refcount_init(&glob->kobj_ref, 1);
 	return (0);
 
 out_no_shrink:
 	vm_page_free(glob->dummy_read_page);
 out_no_drp:
 	/* NOTE(review): the sx and mtx set up above are not destroyed here. */
 	free(glob, M_DRM_GLOBAL);
 	return ret;
 }
 
 /*
  * Release a buffer-object device.
  *
  * Takes down every initialized memory type (highest index first), removes
  * the device from the global device list, stops the delayed-destroy task,
  * runs delayed deletion until ttm_bo_delayed_delete() returns zero, and
  * finally tears down the device address-space manager.
  *
  * Returns 0, or -EBUSY if ttm_bo_clean_mm() failed for any non-system
  * memory type.  The teardown continues regardless.
  */
 int ttm_bo_device_release(struct ttm_bo_device *bdev)
 {
 	int ret = 0;
 	unsigned i = TTM_NUM_MEM_TYPES;
 	struct ttm_mem_type_manager *man;
 	struct ttm_bo_global *glob = bdev->glob;
 
 	while (i--) {
 		man = &bdev->man[i];
 		if (man->has_type) {
 			man->use_type = false;
 			if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, i)) {
 				ret = -EBUSY;
 				/* i is unsigned, so it is printed with %u. */
 				printf("[TTM] DRM memory manager type %u is not clean\n",
 				       i);
 			}
 			man->has_type = false;
 		}
 	}
 
 	sx_xlock(&glob->device_list_mutex);
 	list_del(&bdev->device_list);
 	sx_xunlock(&glob->device_list_mutex);
 
 	/* Cancel the delayed-destroy task; drain it if the cancel fails. */
 	if (taskqueue_cancel_timeout(taskqueue_thread, &bdev->wq, NULL))
 		taskqueue_drain_timeout(taskqueue_thread, &bdev->wq);
 
 	/* Repeat delayed deletion until it reports nothing left to do. */
 	while (ttm_bo_delayed_delete(bdev, true))
 		;
 
 	mtx_lock(&glob->lru_lock);
 	if (list_empty(&bdev->ddestroy))
 		TTM_DEBUG("Delayed destroy list was clean\n");
 
 	if (list_empty(&bdev->man[0].lru))
 		TTM_DEBUG("Swap list was clean\n");
 	mtx_unlock(&glob->lru_lock);
 
 	MPASS(drm_mm_clean(&bdev->addr_space_mm));
 	rw_wlock(&bdev->vm_lock);
 	drm_mm_takedown(&bdev->addr_space_mm);
 	rw_wunlock(&bdev->vm_lock);
 
 	return ret;
 }
 
 /*
  * Initialize a buffer-object device.
  *
  * Clears the memory-type table, initializes the system memory type and the
  * device address-space manager (starting at @file_page_offset), sets up the
  * delayed-destroy task and the remaining device fields, and links the
  * device into @glob's device list.
  *
  * Returns 0 on success, or the error of ttm_bo_init_mm() or drm_mm_init().
  * If drm_mm_init() fails, the system memory type is taken down again.
  */
 int ttm_bo_device_init(struct ttm_bo_device *bdev,
 		       struct ttm_bo_global *glob,
 		       struct ttm_bo_driver *driver,
 		       uint64_t file_page_offset,
 		       bool need_dma32)
 {
 	int ret;
 
 	rw_init(&bdev->vm_lock, "ttmvml");
 	bdev->driver = driver;
 
 	memset(bdev->man, 0, sizeof(bdev->man));
 
 	/*
 	 * Initialize the system memory buffer type.
 	 * Other types need to be driver / IOCTL initialized.
 	 */
 	ret = ttm_bo_init_mm(bdev, TTM_PL_SYSTEM, 0);
 	if (unlikely(ret != 0))
 		return ret;
 
 	RB_INIT(&bdev->addr_space_rb);
 	ret = drm_mm_init(&bdev->addr_space_mm, file_page_offset, 0x10000000);
 	if (unlikely(ret != 0)) {
 		ttm_bo_clean_mm(bdev, 0);
 		return ret;
 	}
 
 	TIMEOUT_TASK_INIT(taskqueue_thread, &bdev->wq, 0,
 	    ttm_bo_delayed_workqueue, bdev);
 	INIT_LIST_HEAD(&bdev->ddestroy);
 	bdev->dev_mapping = NULL;
 	bdev->glob = glob;
 	bdev->need_dma32 = need_dma32;
 	bdev->val_seq = 0;
 	mtx_init(&bdev->fence_lock, "ttmfence", NULL, MTX_DEF);
 
 	sx_xlock(&glob->device_list_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	sx_xunlock(&glob->device_list_mutex);
 
 	return 0;
 }
 
 /*
  * buffer object vm functions.
  */
 
 /*
  * Classify the memory region @mem.
  *
  * A region whose manager has TTM_MEMTYPE_FLAG_FIXED set always yields
  * true.  Otherwise the result is false for system memory, for managers
  * with TTM_MEMTYPE_FLAG_CMA set, and for cached placements, and true for
  * everything else.
  */
 bool ttm_mem_reg_is_pci(struct ttm_bo_device *bdev, struct ttm_mem_reg *mem)
 {
 	struct ttm_mem_type_manager *man = &bdev->man[mem->mem_type];
 
 	if (man->flags & TTM_MEMTYPE_FLAG_FIXED)
 		return true;
 
 	return !(mem->mem_type == TTM_PL_SYSTEM ||
 	    (man->flags & TTM_MEMTYPE_FLAG_CMA) ||
 	    (mem->placement & TTM_PL_FLAG_CACHED));
 }
 
 /*
  * Release the buffer's mmap state and then its I/O reservation for VM use.
  *
  * The only caller visible here, ttm_bo_unmap_virtual(), holds the memory
  * type's io lock around this call; other callers are presumably expected
  * to do the same, as the _locked suffix suggests.
  */
 void ttm_bo_unmap_virtual_locked(struct ttm_buffer_object *bo)
 {
 
 	ttm_bo_release_mmap(bo);
 	ttm_mem_io_free_vm(bo);
 }
 
 /*
  * Tear down the virtual mappings of @bo, taking the io lock of the
  * buffer's current memory type (non-interruptibly) around the call to
  * ttm_bo_unmap_virtual_locked().
  */
 void ttm_bo_unmap_virtual(struct ttm_buffer_object *bo)
 {
 	struct ttm_mem_type_manager *mgr;
 
 	mgr = &bo->bdev->man[bo->mem.mem_type];
 
 	ttm_mem_io_lock(mgr, false);
 	ttm_bo_unmap_virtual_locked(bo);
 	ttm_mem_io_unlock(mgr);
 }
 
 /*
  * Insert @bo into its device's address-space red-black tree.
  * The caller must already hold bdev->vm_lock.
  */
 static void ttm_bo_vm_insert_rb(struct ttm_buffer_object *bo)
 {
 
 	RB_INSERT(ttm_bo_device_buffer_objects, &bo->bdev->addr_space_rb, bo);
 }
 
 /**
  * ttm_bo_setup_vm:
  *
  * @bo: the buffer to allocate address space for
  *
  * Allocate address space in the drm device so that applications
  * can mmap the buffer and access the contents. This only
  * applies to ttm_bo_type_device objects as others are not
  * placed in the drm device address space.
  *
  * Returns 0 on success, the error of drm_mm_pre_get(), or -ENOMEM when no
  * free range of bo->mem.num_pages pages exists.
  */
 
 static int ttm_bo_setup_vm(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	int ret;
 
 	for (;;) {
 		ret = drm_mm_pre_get(&bdev->addr_space_mm);
 		if (unlikely(ret != 0))
 			return ret;
 
 		rw_wlock(&bdev->vm_lock);
 		bo->vm_node = drm_mm_search_free(&bdev->addr_space_mm,
 		    bo->mem.num_pages, 0, 0);
 		if (unlikely(bo->vm_node == NULL)) {
 			rw_wunlock(&bdev->vm_lock);
 			return -ENOMEM;
 		}
 
 		bo->vm_node = drm_mm_get_block_atomic(bo->vm_node,
 		    bo->mem.num_pages, 0);
 		if (likely(bo->vm_node != NULL))
 			break;
 
 		/* The atomic grab failed: drop the lock and start over. */
 		rw_wunlock(&bdev->vm_lock);
 	}
 
 	/* Still holding vm_lock here. */
 	ttm_bo_vm_insert_rb(bo);
 	rw_wunlock(&bdev->vm_lock);
 	bo->addr_space_offset = ((uint64_t) bo->vm_node->start) << PAGE_SHIFT;
 
 	return 0;
 }
 
 /*
  * Wait until the buffer object has no sync object attached.
  *
  * The callers visible in this file hold bdev->fence_lock around this call.
  * The function drops and re-takes that lock around driver callbacks that
  * unreference or wait on a sync object, and every return path taken after
  * such a drop re-acquires it first.
  *
  * @lazy and @interruptible are passed to the driver's sync_obj_wait().
  * With @no_wait set, an unsignaled sync object yields -EBUSY instead of a
  * wait.
  *
  * Returns 0 once bo->sync_obj is NULL, -EBUSY as described above, or the
  * error returned by sync_obj_wait().
  */
 int ttm_bo_wait(struct ttm_buffer_object *bo,
 		bool lazy, bool interruptible, bool no_wait)
 {
 	struct ttm_bo_driver *driver = bo->bdev->driver;
 	struct ttm_bo_device *bdev = bo->bdev;
 	void *sync_obj;
 	int ret = 0;
 
 	if (likely(bo->sync_obj == NULL))
 		return 0;
 
 	while (bo->sync_obj) {
 
 		/* Already signaled: detach it and drop its reference. */
 		if (driver->sync_obj_signaled(bo->sync_obj)) {
 			void *tmp_obj = bo->sync_obj;
 			bo->sync_obj = NULL;
 			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
 			mtx_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&tmp_obj);
 			mtx_lock(&bdev->fence_lock);
 			continue;
 		}
 
 		if (no_wait)
 			return -EBUSY;
 
 		/* Take our own reference, then wait without the lock. */
 		sync_obj = driver->sync_obj_ref(bo->sync_obj);
 		mtx_unlock(&bdev->fence_lock);
 		ret = driver->sync_obj_wait(sync_obj,
 					    lazy, interruptible);
 		if (unlikely(ret != 0)) {
 			driver->sync_obj_unref(&sync_obj);
 			mtx_lock(&bdev->fence_lock);
 			return ret;
 		}
 		mtx_lock(&bdev->fence_lock);
 		/*
 		 * bo->sync_obj may have changed while the lock was dropped.
 		 * Only detach it if it is still the object that was waited on.
 		 */
 		if (likely(bo->sync_obj == sync_obj)) {
 			void *tmp_obj = bo->sync_obj;
 			bo->sync_obj = NULL;
 			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
 				  &bo->priv_flags);
 			mtx_unlock(&bdev->fence_lock);
 			/* Drop both our reference and the buffer's. */
 			driver->sync_obj_unref(&sync_obj);
 			driver->sync_obj_unref(&tmp_obj);
 			mtx_lock(&bdev->fence_lock);
 		} else {
 			mtx_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&sync_obj);
 			mtx_lock(&bdev->fence_lock);
 		}
 	}
 	return 0;
 }
 
 /*
  * Register a CPU writer on @bo.
  *
  * The buffer is reserved (interruptibly; without sleeping when @no_wait is
  * set), its sync object is waited for, and on success bo->cpu_writers is
  * incremented.  The reservation is dropped again before returning.
  *
  * Returns 0 on success or the error of the reserve or wait step.
  */
 int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	int error;
 
 	/* Using ttm_bo_reserve makes sure the lru lists are updated. */
 	error = ttm_bo_reserve(bo, true, no_wait, false, 0);
 	if (unlikely(error != 0))
 		return error;
 
 	mtx_lock(&bdev->fence_lock);
 	error = ttm_bo_wait(bo, false, true, no_wait);
 	mtx_unlock(&bdev->fence_lock);
 
 	if (likely(error == 0))
 		atomic_inc(&bo->cpu_writers);
 
 	ttm_bo_unreserve(bo);
 	return error;
 }
 
 /*
  * Drop one CPU writer from @bo by decrementing bo->cpu_writers, the
  * counter that ttm_bo_synccpu_write_grab() increments.
  */
 void ttm_bo_synccpu_write_release(struct ttm_buffer_object *bo)
 {
 	atomic_dec(&bo->cpu_writers);
 }
 
 /**
  * A buffer object shrink method that tries to swap out the first
  * buffer object on the bo_global::swap_lru list.
  *
  * "First" means the first object on the list that can be reserved.
  *
  * Returns 0 if a buffer was swapped out; -EBUSY if the swap LRU is empty;
  * the error of the last reservation attempt if no object could be
  * reserved; otherwise the error of the failing cleanup, wait, move or
  * swapout step.
  */
 
 static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 {
 	struct ttm_bo_global *glob =
 	    container_of(shrink, struct ttm_bo_global, shrink);
 	struct ttm_buffer_object *bo;
 	int ret = -EBUSY;
 	int put_count;
 	uint32_t swap_placement = (TTM_PL_FLAG_CACHED | TTM_PL_FLAG_SYSTEM);
 
 	mtx_lock(&glob->lru_lock);
 	/* Stop at the first object that can be reserved. */
 	list_for_each_entry(bo, &glob->swap_lru, swap) {
 		ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
 		if (!ret)
 			break;
 	}
 
 	/* Nothing was reserved; bo must not be used past this point. */
 	if (ret) {
 		mtx_unlock(&glob->lru_lock);
 		return ret;
 	}
 
 	/* Extra list reference, dropped again before returning. */
 	refcount_acquire(&bo->list_kref);
 
 	if (!list_empty(&bo->ddestroy)) {
 		/*
 		 * lru_lock is not released on this path by this function.
 		 * NOTE(review): ttm_bo_cleanup_refs_and_unlock() presumably
 		 * drops it, as its name suggests -- confirm.
 		 */
 		ret = ttm_bo_cleanup_refs_and_unlock(bo, false, false);
 		if (refcount_release(&bo->list_kref))
 			ttm_bo_release_list(bo);
 		return ret;
 	}
 
 	put_count = ttm_bo_del_from_lru(bo);
 	mtx_unlock(&glob->lru_lock);
 
 	/* Drop the references counted by ttm_bo_del_from_lru(). */
 	ttm_bo_list_ref_sub(bo, put_count, true);
 
 	/**
 	 * Wait for GPU, then move to system cached.
 	 */
 
 	mtx_lock(&bo->bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, false, false);
 	mtx_unlock(&bo->bdev->fence_lock);
 
 	if (unlikely(ret != 0))
 		goto out;
 
 	/* Move only if the buffer is not already in cached system memory. */
 	if ((bo->mem.placement & swap_placement) != swap_placement) {
 		struct ttm_mem_reg evict_mem;
 
 		evict_mem = bo->mem;
 		evict_mem.mm_node = NULL;
 		evict_mem.placement = TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED;
 		evict_mem.mem_type = TTM_PL_SYSTEM;
 
 		ret = ttm_bo_handle_move_mem(bo, &evict_mem, true,
 					     false, false);
 		if (unlikely(ret != 0))
 			goto out;
 	}
 
 	ttm_bo_unmap_virtual(bo);
 
 	/**
 	 * Swap out. Buffer will be swapped in again as soon as
 	 * anyone tries to access a ttm page.
 	 */
 
 	if (bo->bdev->driver->swap_notify)
 		bo->bdev->driver->swap_notify(bo);
 
 	ret = ttm_tt_swapout(bo->ttm, bo->persistent_swap_storage);
 out:
 
 	/**
 	 *
 	 * Unreserve without putting on LRU to avoid swapping out an
 	 * already swapped buffer.
 	 */
 
 	atomic_set(&bo->reserved, 0);
 	/* Wake any thread sleeping on bo. */
 	wakeup(bo);
 	if (refcount_release(&bo->list_kref))
 		ttm_bo_release_list(bo);
 	return ret;
 }
 
 /*
  * Swap out buffers through the global shrinker, one per call to
  * ttm_bo_swapout(), until a call returns non-zero.
  */
 void ttm_bo_swapout_all(struct ttm_bo_device *bdev)
 {
 	int ret;
 
 	do {
 		ret = ttm_bo_swapout(&bdev->glob->shrink);
 	} while (ret == 0);
 }
diff --git a/sys/dev/drm2/ttm/ttm_page_alloc.c b/sys/dev/drm2/ttm/ttm_page_alloc.c
index b35a06520e07..6fc12cad121f 100644
--- a/sys/dev/drm2/ttm/ttm_page_alloc.c
+++ b/sys/dev/drm2/ttm/ttm_page_alloc.c
@@ -1,921 +1,921 @@
 /*
  * Copyright (c) Red Hat Inc.
 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sub license,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
  * Authors: Dave Airlie <airlied@redhat.com>
  *          Jerome Glisse <jglisse@redhat.com>
  *          Pauli Nieminen <suokkos@gmail.com>
  */
 /*
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * <kib@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
  */
 
 /* simple list based uncached page pool
  * - Pool collects recently freed pages for reuse
  * - Use page->lru to keep a free list
  * - doesn't track currently in use pages
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/ttm/ttm_bo_driver.h>
 #include <dev/drm2/ttm/ttm_page_alloc.h>
 #include <sys/eventhandler.h>
 #include <vm/vm_pageout.h>
 
 #define NUM_PAGES_TO_ALLOC		(PAGE_SIZE/sizeof(vm_page_t))
 #define SMALL_ALLOCATION		16
 #define FREE_ALL_PAGES			(~0U)
 /* times are in msecs */
 #define PAGE_FREE_INTERVAL		1000
 
 /**
  * struct ttm_page_pool - Pool to reuse recently allocated uc/wc pages.
  *
  * @lock: Protects the shared pool from concurrent access.  In this port it
  * is a default (MTX_DEF) mutex, see ttm_page_pool_init_locked().
  * @fill_lock: Prevent concurrent calls to fill.
  * @dma32: NOTE(review): not written by any code in this file's visible
  * part; presumably marks a pool of pages below 4GB -- confirm.
  * @list: Pool of free uc/wc pages for fast reuse.
  * @ttm_page_alloc_flags: TTM_PAGE_FLAG_* bits passed along when new pages
  * are allocated for this pool.
  * @npages: Number of pages in pool.
  * @name: Short pool name assigned at initialization.
  * @nfrees: Running total of pages released from the pool.
  * @nrefills: Number of successful pool refills.
  */
 struct ttm_page_pool {
 	struct mtx		lock;
 	bool			fill_lock;
 	bool			dma32;
 	struct pglist		list;
 	int			ttm_page_alloc_flags;
 	unsigned		npages;
 	char			*name;
 	unsigned long		nfrees;
 	unsigned long		nrefills;
 };
 
 /**
  * Limits for the pool. They are handled without locks because only place where
  * they may change is in sysfs store. They won't have immediate effect anyway
  * so forcing serialization to access them is pointless.
  *
  * @alloc_size: Number of pages requested when a pool is refilled.
  * @max_size: Pool size above which ttm_put_pages() trims the pool.
  * @small: Requests for fewer pages than this may trigger a pool refill.
  */
 
 struct ttm_pool_opts {
 	unsigned	alloc_size;
 	unsigned	max_size;
 	unsigned	small;
 };
 
 #define NUM_POOLS 4
 
 /**
  * struct ttm_pool_manager - Holds memory pools for fast allocation
  *
  * Manager is read only object for pool code so it doesn't need locking.
  *
  * @kobj_ref: Reference count; the manager is freed when it drops to zero.
  * @lowmem_handler: Tag of the vm_lowmem event handler that shrinks the
  * pools.
  * @options: Tunable pool limits, see struct ttm_pool_opts.
  *
  * @pools: All pool objects in use.  The union lets the same storage be
  * addressed either by index (u_pools) or by name (wc/uc, plain/dma32).
  **/
 struct ttm_pool_manager {
 	unsigned int kobj_ref;
 	eventhandler_tag lowmem_handler;
 	struct ttm_pool_opts	options;
 
 	union {
 		struct ttm_page_pool	u_pools[NUM_POOLS];
 		struct _utag {
 			struct ttm_page_pool	u_wc_pool;
 			struct ttm_page_pool	u_uc_pool;
 			struct ttm_page_pool	u_wc_pool_dma32;
 			struct ttm_page_pool	u_uc_pool_dma32;
 		} _ut;
 	} _u;
 };
 
 /*
  * Shorthand accessors for the members of the union inside
  * struct ttm_pool_manager, so that callers can write manager->pools[i]
  * or manager->wc_pool.
  */
 #define	pools _u.u_pools
 #define	wc_pool _u._ut.u_wc_pool
 #define	uc_pool _u._ut.u_uc_pool
 #define	wc_pool_dma32 _u._ut.u_wc_pool_dma32
 #define	uc_pool_dma32 _u._ut.u_uc_pool_dma32
 
 MALLOC_DEFINE(M_TTM_POOLMGR, "ttm_poolmgr", "TTM Pool Manager");
 
 /*
  * Hand a page obtained from ttm_vm_page_alloc() back to the VM.  The
  * fictitious/managed marking set at allocation time is reverted and the
  * wiring is dropped before the page is freed.
  */
 static void
 ttm_vm_page_free(vm_page_t m)
 {
 
 	/* The page must still look the way TTM set it up. */
 	KASSERT(m->object == NULL, ("ttm page %p is owned", m));
 	KASSERT(vm_page_wired(m), ("ttm lost wire %p", m));
 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("ttm lost fictitious %p", m));
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("ttm got unmanaged %p", m));
 
 	m->oflags |= VPO_UNMANAGED;
 	m->flags &= ~PG_FICTITIOUS;
 
 	vm_page_unwire_noq(m);
 	vm_page_free(m);
 }
 
 /*
  * Translate a TTM caching state into the matching VM memory attribute.
  * Panics on a value outside the three known caching states.
  */
 static vm_memattr_t
 ttm_caching_state_to_vm(enum ttm_caching_state cstate)
 {
 
 	if (cstate == tt_uncached)
 		return (VM_MEMATTR_UNCACHEABLE);
 	if (cstate == tt_wc)
 		return (VM_MEMATTR_WRITE_COMBINING);
 	if (cstate == tt_cached)
 		return (VM_MEMATTR_WRITE_BACK);
 
 	panic("caching state %d\n", cstate);
 }
 
 /*
  * Allocate a single page whose physical address lies in [0, 0xffffffff],
  * aligned to PAGE_SIZE, with the given memory attribute.
  *
  * A failed attempt is followed by a contiguous-memory reclaim (or, if
  * that finds nothing, a vm_wait()) and a retry.  Returns the page, or
  * NULL once the fourth attempt has failed.
  */
 static vm_page_t
 ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr)
 {
 	vm_page_t p;
 	int tries;
 
 	for (tries = 0; ; tries++) {
-		p = vm_page_alloc_contig(NULL, 0, req, 1, 0, 0xffffffff,
-		    PAGE_SIZE, 0, memattr);
+		p = vm_page_alloc_noobj_contig(req, 1, 0, 0xffffffff, PAGE_SIZE,
+		    0, memattr);
 		/* Stop on success, or give up after the retries are used. */
 		if (p != NULL || tries > 2)
 			return (p);
 		/* Try to reclaim suitable memory; sleep if nothing was found. */
 		if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
 		    PAGE_SIZE, 0))
 			vm_wait(NULL);
 	}
 }
 
 /*
  * Allocate one object-less page without any address restriction, waiting
  * for memory if needed, and apply the requested memory attribute.
  */
 static vm_page_t
 ttm_vm_page_alloc_any(int req, vm_memattr_t memattr)
 {
 	vm_page_t page;
 	int wait_req;
 
 	wait_req = req | VM_ALLOC_WAITOK;
 	page = vm_page_alloc_noobj(wait_req);
 	pmap_page_set_memattr(page, memattr);
 
 	return (page);
 }
 
 /*
  * Allocate one wired page for TTM.
  *
  * flags:  TTM_PAGE_FLAG_* bits; ZERO_ALLOC requests a zeroed page and
  *         DMA32 selects the allocator restricted to the low 4GB.
  * cstate: caching state, converted to the VM memory attribute to apply.
  *
  * On success the page is marked managed and fictitious (the reverse is
  * done in ttm_vm_page_free()).  Returns NULL if the allocation failed.
  */
 static vm_page_t
 ttm_vm_page_alloc(int flags, enum ttm_caching_state cstate)
 {
 	vm_page_t p;
 	vm_memattr_t memattr;
 	int req;
 
 	memattr = ttm_caching_state_to_vm(cstate);
-	req = VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ;
+	req = VM_ALLOC_WIRED;
 	if ((flags & TTM_PAGE_FLAG_ZERO_ALLOC) != 0)
 		req |= VM_ALLOC_ZERO;
 
 	if ((flags & TTM_PAGE_FLAG_DMA32) != 0)
 		p = ttm_vm_page_alloc_dma32(req, memattr);
 	else
 		p = ttm_vm_page_alloc_any(req, memattr);
 
 	if (p != NULL) {
 		/* Tag the page so ttm_vm_page_free() can recognise it. */
 		p->oflags &= ~VPO_UNMANAGED;
 		p->flags |= PG_FICTITIOUS;
 	}
 	return (p);
 }
 
 /* Release the memory of a pool manager allocated from M_TTM_POOLMGR. */
 static void ttm_pool_kobj_release(struct ttm_pool_manager *m)
 {
 
 	free(m, M_TTM_POOLMGR);
 }
 
 #if 0
 /* XXXKIB sysctl */
 /*
  * Disabled code kept from the Linux sysfs interface: parse a size given
  * in kilobytes and store it, converted to pages, into the pool option
  * selected by attr.
  */
 static ssize_t ttm_pool_store(struct ttm_pool_manager *m,
 		struct attribute *attr, const char *buffer, size_t size)
 {
 	int chars;
 	unsigned val;
 	chars = sscanf(buffer, "%u", &val);
 	if (chars == 0)
 		return size;
 
 	/* Convert kb to number of pages */
 	val = val / (PAGE_SIZE >> 10);
 
 	if (attr == &ttm_page_pool_max)
 		m->options.max_size = val;
 	else if (attr == &ttm_page_pool_small)
 		m->options.small = val;
 	else if (attr == &ttm_page_pool_alloc_size) {
 		/* Refuse values above 8 * NUM_PAGES_TO_ALLOC, warn above 1x. */
 		if (val > NUM_PAGES_TO_ALLOC*8) {
 			pr_err("Setting allocation size to %lu is not allowed. Recommended size is %lu\n",
 			       NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 7),
 			       NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10));
 			return size;
 		} else if (val > NUM_PAGES_TO_ALLOC) {
 			pr_warn("Setting allocation size to larger than %lu is not recommended\n",
 				NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10));
 		}
 		m->options.alloc_size = val;
 	}
 
 	return size;
 }
 
 /*
  * Disabled code kept from the Linux sysfs interface: print the pool
  * option selected by attr, converted from pages to kilobytes.
  */
 static ssize_t ttm_pool_show(struct ttm_pool_manager *m,
 		struct attribute *attr, char *buffer)
 {
 	unsigned val = 0;
 
 	if (attr == &ttm_page_pool_max)
 		val = m->options.max_size;
 	else if (attr == &ttm_page_pool_small)
 		val = m->options.small;
 	else if (attr == &ttm_page_pool_alloc_size)
 		val = m->options.alloc_size;
 
 	val = val * (PAGE_SIZE >> 10);
 
 	return snprintf(buffer, PAGE_SIZE, "%u\n", val);
 }
 #endif
 
 static struct ttm_pool_manager *_manager;
 
 /*
  * Switch the first addrinarray pages of the array to write-back caching.
  * Without TTM_HAS_AGP this does nothing.  Always returns 0.
  */
 static int set_pages_array_wb(vm_page_t *pages, int addrinarray)
 {
 #ifdef TTM_HAS_AGP
 	int idx = 0;
 
 	while (idx < addrinarray) {
 		pmap_page_set_memattr(pages[idx], VM_MEMATTR_WRITE_BACK);
 		idx++;
 	}
 #endif
 	return 0;
 }
 
 /*
  * Switch the first addrinarray pages of the array to write-combining.
  * Without TTM_HAS_AGP this does nothing.  Always returns 0.
  */
 static int set_pages_array_wc(vm_page_t *pages, int addrinarray)
 {
 #ifdef TTM_HAS_AGP
 	int idx = 0;
 
 	while (idx < addrinarray) {
 		pmap_page_set_memattr(pages[idx], VM_MEMATTR_WRITE_COMBINING);
 		idx++;
 	}
 #endif
 	return 0;
 }
 
 /*
  * Switch the first addrinarray pages of the array to uncacheable.
  * Without TTM_HAS_AGP this does nothing.  Always returns 0.
  */
 static int set_pages_array_uc(vm_page_t *pages, int addrinarray)
 {
 #ifdef TTM_HAS_AGP
 	int idx = 0;
 
 	while (idx < addrinarray) {
 		pmap_page_set_memattr(pages[idx], VM_MEMATTR_UNCACHEABLE);
 		idx++;
 	}
 #endif
 	return 0;
 }
 
 /**
  * Select the pool matching the requested caching state and ttm flags.
  *
  * Bit 0 of the pool index distinguishes wc (0) from uc (1), bit 1 is set
  * for DMA32 pools.  Cached pages are not pooled, so NULL is returned for
  * tt_cached.
  */
 static struct ttm_page_pool *ttm_get_pool(int flags,
 		enum ttm_caching_state cstate)
 {
 	int slot;
 
 	if (cstate == tt_cached)
 		return NULL;
 
 	slot = (cstate == tt_wc) ? 0x0 : 0x1;
 	if ((flags & TTM_PAGE_FLAG_DMA32) != 0)
 		slot |= 0x2;
 
 	return &_manager->pools[slot];
 }
 
 /*
  * Reset the caching of npages pages to write-back and give every one of
  * them back to the VM.
  */
 static void ttm_pages_put(vm_page_t *pages, unsigned npages)
 {
 	unsigned idx;
 
 	/* Our VM handles vm memattr automatically on the page free. */
 	if (set_pages_array_wb(pages, npages) != 0)
 		printf("[TTM] Failed to set %d pages to wb!\n", npages);
 
 	idx = 0;
 	while (idx < npages) {
 		ttm_vm_page_free(pages[idx]);
 		++idx;
 	}
 }
 
 /*
  * Account for freed_pages pages that have just been taken out of the
  * pool: shrink the page count and grow the free statistics.
  */
 static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
 		unsigned freed_pages)
 {
 	pool->nfrees = pool->nfrees + freed_pages;
 	pool->npages = pool->npages - freed_pages;
 }
 
 /**
  * Free pages from pool.
  *
  * To prevent hogging the ttm_swap process we only free NUM_PAGES_TO_ALLOC
  * number of pages in one go.
  *
  * @pool: to free the pages from
  * @nr_free: number of pages to free; FREE_ALL_PAGES frees the whole pool
  *
  * Pages are taken from the tail of the pool list.  Returns the number of
  * pages that were requested but could not be freed.
  **/
 static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free)
 {
 	vm_page_t p, p1;
 	vm_page_t *pages_to_free;
 	unsigned freed_pages = 0,
 		 npages_to_free = nr_free;
 	unsigned i;
 
 	/* The scratch array never holds more than one batch. */
 	if (NUM_PAGES_TO_ALLOC < nr_free)
 		npages_to_free = NUM_PAGES_TO_ALLOC;
 
 	pages_to_free = malloc(npages_to_free * sizeof(vm_page_t),
 	    M_TEMP, M_WAITOK | M_ZERO);
 
 restart:
 	mtx_lock(&pool->lock);
 
 	TAILQ_FOREACH_REVERSE_SAFE(p, &pool->list, pglist, plinks.q, p1) {
 		if (freed_pages >= npages_to_free)
 			break;
 
 		pages_to_free[freed_pages++] = p;
 		/* We can only remove NUM_PAGES_TO_ALLOC at a time. */
 		if (freed_pages >= NUM_PAGES_TO_ALLOC) {
 			/* remove range of pages from the pool */
 			for (i = 0; i < freed_pages; i++)
 				TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q);
 
 			ttm_pool_update_free_locked(pool, freed_pages);
 			/**
 			 * Because changing page caching is costly
 			 * we unlock the pool to prevent stalling.
 			 */
 			mtx_unlock(&pool->lock);
 
 			ttm_pages_put(pages_to_free, freed_pages);
 			/* FREE_ALL_PAGES is a sentinel and is never decremented. */
 			if (likely(nr_free != FREE_ALL_PAGES))
 				nr_free -= freed_pages;
 
 			/* Size the next batch. */
 			if (NUM_PAGES_TO_ALLOC >= nr_free)
 				npages_to_free = nr_free;
 			else
 				npages_to_free = NUM_PAGES_TO_ALLOC;
 
 			freed_pages = 0;
 
 			/* free all so restart the processing */
 			if (nr_free)
 				goto restart;
 
 			/* Not allowed to fall through or break because
 			 * following context is inside the pool lock while we
 			 * are outside here.
 			 */
 			goto out;
 
 		}
 	}
 
 	/* remove range of pages from the pool */
 	if (freed_pages) {
 		for (i = 0; i < freed_pages; i++)
 			TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q);
 
 		ttm_pool_update_free_locked(pool, freed_pages);
 		nr_free -= freed_pages;
 	}
 
 	mtx_unlock(&pool->lock);
 
 	/* Release the final, partial batch outside of the lock. */
 	if (freed_pages)
 		ttm_pages_put(pages_to_free, freed_pages);
 out:
 	free(pages_to_free, M_TEMP);
 	return nr_free;
 }
 
 /*
  * Estimate how many pages are currently parked in the pools by summing
  * the per-pool counters (read without taking the pool locks).
  */
 static int ttm_pool_get_num_unused_pages(void)
 {
 	int total = 0;
 	unsigned idx = 0;
 
 	while (idx < NUM_POOLS) {
 		total += _manager->pools[idx].npages;
 		++idx;
 	}
 
 	return total;
 }
 
 /**
  * Callback for the VM to ask the pools to give pages back.
  *
  * Tries to free up to 100 pages in total, visiting the pools in turn and
  * starting from a different pool on every invocation.  Returns the
  * estimated number of pages still held by the pools.
  */
 static int ttm_pool_mm_shrink(void *arg)
 {
 	static unsigned int start_pool = 0;
 	struct ttm_page_pool *victim;
 	unsigned first, step;
 	int remaining = 100; /* XXXKIB */
 
 	/* select start pool in round robin fashion */
 	first = atomic_fetchadd_int(&start_pool, 1) % NUM_POOLS;
 
 	for (step = 0; step < NUM_POOLS && remaining != 0; ++step) {
 		victim = &_manager->pools[(step + first) % NUM_POOLS];
 		remaining = ttm_page_pool_free(victim, (unsigned)remaining);
 	}
 
 	/* return estimated number of unused pages in pool */
 	return ttm_pool_get_num_unused_pages();
 }
 
 /*
  * Register ttm_pool_mm_shrink() as a vm_lowmem event handler and remember
  * the returned tag in the manager so it can be deregistered later.
  */
 static void ttm_pool_mm_shrink_init(struct ttm_pool_manager *manager)
 {
 
 	manager->lowmem_handler = EVENTHANDLER_REGISTER(vm_lowmem,
 	    ttm_pool_mm_shrink, manager, EVENTHANDLER_PRI_ANY);
 }
 
 /* Remove the vm_lowmem event handler installed by ttm_pool_mm_shrink_init(). */
 static void ttm_pool_mm_shrink_fini(struct ttm_pool_manager *manager)
 {
 
 	EVENTHANDLER_DEREGISTER(vm_lowmem, manager->lowmem_handler);
 }
 
 /*
  * Apply the caching state cstate to the first cpages entries of pages.
  * Only tt_uncached and tt_wc need work; any other state is a no-op.
  * Returns 0 on success or the error reported by the set_pages_array_*
  * helper, after logging it.
  */
 static int ttm_set_pages_caching(vm_page_t *pages,
 		enum ttm_caching_state cstate, unsigned cpages)
 {
 	int error;
 
 	if (cstate == tt_uncached) {
 		error = set_pages_array_uc(pages, cpages);
 		if (error != 0)
 			printf("[TTM] Failed to set %d pages to uc!\n", cpages);
 		return error;
 	}
 
 	if (cstate == tt_wc) {
 		error = set_pages_array_wc(pages, cpages);
 		if (error != 0)
 			printf("[TTM] Failed to set %d pages to wc!\n", cpages);
 		return error;
 	}
 
 	return 0;
 }
 
 /**
  * Dispose of the pages whose caching state could not be changed: each of
  * the cpages entries of failed_pages is unlinked from the pages list and
  * freed.  Pages not named in failed_pages stay on the list.
  */
 static void ttm_handle_caching_state_failure(struct pglist *pages,
 		int ttm_flags, enum ttm_caching_state cstate,
 		vm_page_t *failed_pages, unsigned cpages)
 {
 	unsigned idx = 0;
 
 	/* Failed pages have to be freed */
 	while (idx < cpages) {
 		TAILQ_REMOVE(pages, failed_pages[idx], plinks.q);
 		ttm_vm_page_free(failed_pages[idx]);
 		++idx;
 	}
 }
 
 /**
  * Allocate new pages with correct caching.
  *
  * This function is reentrant if caller updates count depending on number of
  * pages returned in pages array.
  *
  * @pages: list that receives the newly allocated pages
  * @ttm_alloc_flags: TTM_PAGE_FLAG_* bits used for the allocation
  * @ttm_flags: passed through to the caching-failure handler
  * @cstate: caching state to apply to the new pages
  * @count: number of pages to allocate
  *
  * Returns 0 when all pages were allocated, -ENOMEM when a page could not
  * be allocated, or the error from changing the caching state.  On failure
  * the pages that were set up successfully are left on @pages for the
  * caller to dispose of.
  */
 static int ttm_alloc_new_pages(struct pglist *pages, int ttm_alloc_flags,
 		int ttm_flags, enum ttm_caching_state cstate, unsigned count)
 {
 	vm_page_t *caching_array;
 	vm_page_t p;
 	int r = 0;
 	unsigned i, cpages;
 	unsigned max_cpages = min(count,
 			(unsigned)(PAGE_SIZE/sizeof(vm_page_t)));
 
 	/* allocate array for page caching change */
 	caching_array = malloc(max_cpages * sizeof(vm_page_t), M_TEMP,
 	    M_WAITOK | M_ZERO);
 
 	for (i = 0, cpages = 0; i < count; ++i) {
 		p = ttm_vm_page_alloc(ttm_alloc_flags, cstate);
 		if (!p) {
 			printf("[TTM] Unable to get page %u\n", i);
 
 			/* store already allocated pages in the pool after
 			 * setting the caching state */
 			if (cpages) {
 				r = ttm_set_pages_caching(caching_array,
 							  cstate, cpages);
 				if (r)
 					ttm_handle_caching_state_failure(pages,
 						ttm_flags, cstate,
 						caching_array, cpages);
 			}
 			r = -ENOMEM;
 			goto out;
 		}
 
 		/*
 		 * Link the page before it is recorded in caching_array:
 		 * the failure handler unlinks every page named in that
 		 * array, so each of them must already be on the list.
 		 */
 		TAILQ_INSERT_HEAD(pages, p, plinks.q);
 
 #ifdef CONFIG_HIGHMEM /* KIB: nop */
 		/* gfp flags of highmem page should never be dma32 so we
 		 * we should be fine in such case
 		 */
 		if (!PageHighMem(p))
 #endif
 		{
 			caching_array[cpages++] = p;
 			if (cpages == max_cpages) {
 
 				/* The batch is full: flush it. */
 				r = ttm_set_pages_caching(caching_array,
 						cstate, cpages);
 				if (r) {
 					ttm_handle_caching_state_failure(pages,
 						ttm_flags, cstate,
 						caching_array, cpages);
 					goto out;
 				}
 				cpages = 0;
 			}
 		}
 	}
 
 	/* Flush the last, partially filled batch. */
 	if (cpages) {
 		r = ttm_set_pages_caching(caching_array, cstate, cpages);
 		if (r)
 			ttm_handle_caching_state_failure(pages,
 					ttm_flags, cstate,
 					caching_array, cpages);
 	}
 out:
 	free(caching_array, M_TEMP);
 
 	return r;
 }
 
 /**
  * Fill the given pool if there aren't enough pages and the requested number of
  * pages is small.
  *
  * Must be called with pool->lock held.  The lock is dropped while the new
  * pages are allocated and is held again on return.
  *
  * @pool: pool to refill
  * @ttm_flags: TTM page flags of the request
  * @cstate: caching state of the pages to allocate
  * @count: number of pages the caller is about to take from the pool
  */
 static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool,
     int ttm_flags, enum ttm_caching_state cstate, unsigned count)
 {
 	vm_page_t p;
 	int r;
 	unsigned cpages = 0;
 	/**
 	 * Only allow one pool fill operation at a time.
 	 * If pool doesn't have enough pages for the allocation new pages are
 	 * allocated from outside of pool.
 	 */
 	if (pool->fill_lock)
 		return;
 
 	pool->fill_lock = true;
 
 	/* If allocation request is small and there are not enough
 	 * pages in a pool we fill the pool up first. */
 	if (count < _manager->options.small
 		&& count > pool->npages) {
 		struct pglist new_pages;
 		unsigned alloc_size = _manager->options.alloc_size;
 
 		/**
 		 * Changing the page caching may be slow, so the pool lock is
 		 * dropped around the allocation.
 		 */
 		mtx_unlock(&pool->lock);
 
 		TAILQ_INIT(&new_pages);
 		r = ttm_alloc_new_pages(&new_pages, pool->ttm_page_alloc_flags,
 		    ttm_flags, cstate, alloc_size);
 		mtx_lock(&pool->lock);
 
 		if (!r) {
 			TAILQ_CONCAT(&pool->list, &new_pages, plinks.q);
 			++pool->nrefills;
 			pool->npages += alloc_size;
 		} else {
 			printf("[TTM] Failed to fill pool (%p)\n", pool);
 			/*
 			 * If we have any pages left put them to the pool.
 			 * Count the pages that were actually obtained (the
 			 * new_pages list, not the pool's own list) so that
 			 * npages keeps matching the length of pool->list.
 			 */
 			TAILQ_FOREACH(p, &new_pages, plinks.q) {
 				++cpages;
 			}
 			TAILQ_CONCAT(&pool->list, &new_pages, plinks.q);
 			pool->npages += cpages;
 		}
 
 	}
 	pool->fill_lock = false;
 }
 
 /**
  * Move up to 'count' pages from the pool onto the caller's list, after
  * giving the pool a chance to refill itself.
  *
  * @return count of pages still required to fulfill the request.
  */
 static unsigned ttm_page_pool_get_pages(struct ttm_page_pool *pool,
 					struct pglist *pages,
 					int ttm_flags,
 					enum ttm_caching_state cstate,
 					unsigned count)
 {
 	vm_page_t head;
 	unsigned moved;
 
 	mtx_lock(&pool->lock);
 	ttm_page_pool_fill_locked(pool, ttm_flags, cstate, count);
 
 	if (count >= pool->npages) {
 		/* The request drains the pool: hand over the whole list. */
 		TAILQ_CONCAT(pages, &pool->list, plinks.q);
 		count -= pool->npages;
 		pool->npages = 0;
 	} else {
 		/* Peel the first 'count' pages off the front of the pool. */
 		for (moved = 0; moved < count; moved++) {
 			head = TAILQ_FIRST(&pool->list);
 			TAILQ_REMOVE(&pool->list, head, plinks.q);
 			TAILQ_INSERT_TAIL(pages, head, plinks.q);
 		}
 		pool->npages -= count;
 		count = 0;
 	}
 
 	mtx_unlock(&pool->lock);
 	return count;
 }
 
 /*
  * Give pages back: they are parked in the pool that matches flags and
  * cstate, or freed right away when no such pool exists.  Every non-NULL
  * entry of the array is consumed and set to NULL.  If the pool grows
  * beyond its configured maximum, the surplus is freed.
  */
 static void ttm_put_pages(vm_page_t *pages, unsigned npages, int flags,
 			  enum ttm_caching_state cstate)
 {
 	struct ttm_page_pool *pool;
 	unsigned idx, excess;
 
 	pool = ttm_get_pool(flags, cstate);
 	if (pool == NULL) {
 		/* No pool for this memory type so free the pages */
 		for (idx = 0; idx < npages; idx++) {
 			if (pages[idx] == NULL)
 				continue;
 			ttm_vm_page_free(pages[idx]);
 			pages[idx] = NULL;
 		}
 		return;
 	}
 
 	mtx_lock(&pool->lock);
 	for (idx = 0; idx < npages; idx++) {
 		if (pages[idx] == NULL)
 			continue;
 		TAILQ_INSERT_TAIL(&pool->list, pages[idx], plinks.q);
 		pages[idx] = NULL;
 		pool->npages++;
 	}
 
 	/* Check that we don't go over the pool limit */
 	excess = 0;
 	if (pool->npages > _manager->options.max_size) {
 		excess = pool->npages - _manager->options.max_size;
 		/* free at least NUM_PAGES_TO_ALLOC number of pages
 		 * to reduce calls to set_memory_wb */
 		if (excess < NUM_PAGES_TO_ALLOC)
 			excess = NUM_PAGES_TO_ALLOC;
 	}
 	mtx_unlock(&pool->lock);
 
 	if (excess != 0)
 		ttm_page_pool_free(pool, excess);
 }
 
 /*
  * Obtain npages pages with the caching state cstate and store them in
  * the pages array.  Pooled pages are used first; whatever the pool
  * cannot supply is allocated fresh.
  *
  * Returns 0 on success, in which case the array holds npages correctly
  * cached pages, or a negative errno value on failure.
  */
 static int ttm_get_pages(vm_page_t *pages, unsigned npages, int flags,
 			 enum ttm_caching_state cstate)
 {
 	struct ttm_page_pool *pool;
 	struct pglist batch;
 	vm_page_t pg;
 	unsigned filled, missing, idx;
 	int alloc_flags, error;
 
 	pool = ttm_get_pool(flags, cstate);
 
 	/* No pool for cached pages */
 	if (pool == NULL) {
 		for (idx = 0; idx < npages; ++idx) {
 			pg = ttm_vm_page_alloc(flags, cstate);
 			if (pg == NULL) {
 				printf("[TTM] Unable to allocate page\n");
 				return -ENOMEM;
 			}
 			pages[idx] = pg;
 		}
 		return 0;
 	}
 
 	/* combine zero flag to pool flags */
 	alloc_flags = flags | pool->ttm_page_alloc_flags;
 
 	/*
 	 * First we take pages from the pool; recycled pages are cleared
 	 * here if the caller asked for zeroed memory.
 	 */
 	TAILQ_INIT(&batch);
 	missing = ttm_page_pool_get_pages(pool, &batch, flags, cstate, npages);
 	filled = 0;
 	TAILQ_FOREACH(pg, &batch, plinks.q) {
 		pages[filled++] = pg;
 		if ((flags & TTM_PAGE_FLAG_ZERO_ALLOC) != 0)
 			pmap_zero_page(pg);
 	}
 
 	if (missing == 0)
 		return 0;
 
 	/* If pool didn't have enough pages allocate new one.
 	 * ttm_alloc_new_pages doesn't reference pool so we can run
 	 * multiple requests in parallel.
 	 **/
 	TAILQ_INIT(&batch);
 	error = ttm_alloc_new_pages(&batch, alloc_flags, flags, cstate,
 	    missing);
 	TAILQ_FOREACH(pg, &batch, plinks.q) {
 		pages[filled++] = pg;
 	}
 	if (error != 0) {
 		/* If there is any pages in the list put them back to
 		 * the pool. */
 		printf("[TTM] Failed to allocate extra pages for large request\n");
 		ttm_put_pages(pages, filled, flags, cstate);
 		return error;
 	}
 
 	return 0;
 }
 
 /*
  * Prepare an empty pool: set up its mutex and page list, reset the
  * counters and record the allocation flags and name.
  */
 static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, int flags,
 				      char *name)
 {
 	mtx_init(&pool->lock, "ttmpool", NULL, MTX_DEF);
 	TAILQ_INIT(&pool->list);
 
 	pool->name = name;
 	pool->ttm_page_alloc_flags = flags;
 	pool->fill_lock = false;
 	pool->nfrees = 0;
 	pool->npages = 0;
 }
 
 int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
 {
 
 	if (_manager != NULL)
 		printf("[TTM] manager != NULL\n");
 	printf("[TTM] Initializing pool allocator\n");
 
 	_manager = malloc(sizeof(*_manager), M_TTM_POOLMGR, M_WAITOK | M_ZERO);
 
 	ttm_page_pool_init_locked(&_manager->wc_pool, 0, "wc");
 	ttm_page_pool_init_locked(&_manager->uc_pool, 0, "uc");
 	ttm_page_pool_init_locked(&_manager->wc_pool_dma32,
 	    TTM_PAGE_FLAG_DMA32, "wc dma");
 	ttm_page_pool_init_locked(&_manager->uc_pool_dma32,
 	    TTM_PAGE_FLAG_DMA32, "uc dma");
 
 	_manager->options.max_size = max_pages;
 	_manager->options.small = SMALL_ALLOCATION;
 	_manager->options.alloc_size = NUM_PAGES_TO_ALLOC;
 
 	refcount_init(&_manager->kobj_ref, 1);
 	ttm_pool_mm_shrink_init(_manager);
 
 	return 0;
 }
 
 /*
  * Tear down the global pool manager: unhook the low-memory shrinker,
  * release every pooled page, destroy the pool mutexes and drop the
  * manager reference (freeing it when that was the last one).
  */
 void ttm_page_alloc_fini(void)
 {
 	int i;
 
 	printf("[TTM] Finalizing pool allocator\n");
 	ttm_pool_mm_shrink_fini(_manager);
 
 	for (i = 0; i < NUM_POOLS; ++i) {
 		ttm_page_pool_free(&_manager->pools[i], FREE_ALL_PAGES);
 		/*
 		 * The mutex was set up with mtx_init() and lives in memory
 		 * that is about to be freed, so it has to be destroyed.
 		 */
 		mtx_destroy(&_manager->pools[i].lock);
 	}
 
 	if (refcount_release(&_manager->kobj_ref))
 		ttm_pool_kobj_release(_manager);
 	_manager = NULL;
 }
 
 int ttm_pool_populate(struct ttm_tt *ttm)
 {
 	struct ttm_mem_global *mem_glob = ttm->glob->mem_glob;
 	unsigned i;
 	int ret;
 
 	if (ttm->state != tt_unpopulated)
 		return 0;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
 		ret = ttm_get_pages(&ttm->pages[i], 1,
 				    ttm->page_flags,
 				    ttm->caching_state);
 		if (ret != 0) {
 			ttm_pool_unpopulate(ttm);
 			return -ENOMEM;
 		}
 
 		ret = ttm_mem_global_alloc_page(mem_glob, ttm->pages[i],
 						false, false);
 		if (unlikely(ret != 0)) {
 			ttm_pool_unpopulate(ttm);
 			return -ENOMEM;
 		}
 	}
 
 	if (unlikely(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)) {
 		ret = ttm_tt_swapin(ttm);
 		if (unlikely(ret != 0)) {
 			ttm_pool_unpopulate(ttm);
 			return ret;
 		}
 	}
 
 	ttm->state = tt_unbound;
 	return 0;
 }
 
 void ttm_pool_unpopulate(struct ttm_tt *ttm)
 {
 	unsigned i;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
 		if (ttm->pages[i]) {
 			ttm_mem_global_free_page(ttm->glob->mem_glob,
 						 ttm->pages[i]);
 			ttm_put_pages(&ttm->pages[i], 1,
 				      ttm->page_flags,
 				      ttm->caching_state);
 		}
 	}
 	ttm->state = tt_unpopulated;
 }
 
 #if 0
 /* XXXKIB sysctl */
 /*
  * Disabled code kept from the Linux debugfs interface: print one line of
  * statistics (name, refills, pages freed, current size) for every pool,
  * or a notice when no pool manager exists.
  */
 int ttm_page_alloc_debugfs(struct seq_file *m, void *data)
 {
 	struct ttm_page_pool *p;
 	unsigned i;
 	char *h[] = {"pool", "refills", "pages freed", "size"};
 	if (!_manager) {
 		seq_printf(m, "No pool allocator running.\n");
 		return 0;
 	}
 	seq_printf(m, "%6s %12s %13s %8s\n",
 			h[0], h[1], h[2], h[3]);
 	for (i = 0; i < NUM_POOLS; ++i) {
 		p = &_manager->pools[i];
 
 		seq_printf(m, "%6s %12ld %13ld %8d\n",
 				p->name, p->nrefills,
 				p->nfrees, p->npages);
 	}
 	return 0;
 }
 #endif
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index f6179592fb11..1d3321fd4ff6 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1,2719 +1,2718 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/domainset.h>
 #include <sys/endian.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/ktls.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pagequeue.h>
 
 /*
  * Per-CPU KTLS work queue.  ktls_init() allocates an array of these
  * (mp_maxid + 1 entries) and starts one worker thread per CPU on it.
  * Aligned to a cache line.
  */
 struct ktls_wq {
 	struct mtx	mtx;		/* "ktls work queue" mutex */
 	STAILQ_HEAD(, mbuf) m_head;	/* queued mbufs */
 	STAILQ_HEAD(, socket) so_head;	/* queued sockets */
 	bool		running;	/* presumably worker-active flag; used outside this view */
 	int		lastallocfail;	/* NOTE(review): usage not visible here */
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * State for a per-domain buffer allocation thread (see the
  * ktls_alloc_thread() kthread started from ktls_init()).
  */
 struct ktls_alloc_thread {
 	uint64_t wakeups;	/* presumably number of wakeups; updated outside this view */
 	uint64_t allocs;	/* presumably number of allocations; updated outside this view */
 	struct thread *td;	/* thread handle filled in by kproc_kthread_add() */
 	int running;		/* NOTE(review): usage not visible here */
 };
 
 /*
  * Per-memory-domain bookkeeping.  When ktls_bind_threads > 1,
  * ktls_init() records each CPU in the entry for that CPU's domain.
  */
 struct ktls_domain_info {
 	int count;		/* number of valid entries in cpu[] */
 	int cpu[MAXCPU];	/* CPU ids belonging to this domain */
 	struct ktls_alloc_thread alloc_td;	/* this domain's allocation thread */
 };
 
 /* Per-domain CPU lists and allocation threads, indexed by domain. */
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
 /* Array of per-CPU work queues, allocated in ktls_init(). */
 static struct ktls_wq *ktls_wq;
 /* Process that all KTLS kthreads are added to. */
 static struct proc *ktls_proc;
 /* UMA zone for struct ktls_session. */
 static uma_zone_t ktls_session_zone;
 /* UMA cache zone of output buffers; only created if ktls_sw_buffer_cache. */
 static uma_zone_t ktls_buffer_zone;
 /* Maps a thread index (0..ktls_number_threads-1) to a CPU id. */
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 /* 0: not initialized, > 0: initialized, < 0: initialization failed. */
 static int ktls_init_state;
 /* Serializes the one-time call to ktls_init(). */
 static struct sx ktls_init_lock;
 SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init");
 
 /* Root sysctl nodes: kern.ipc.tls and kern.ipc.tls.stats. */
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 /* Thread binding policy; defaults to 1 when the kernel is built with RSS. */
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
 
 /* Must be page-aligned; asserted in ktls_buffer_import(). */
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 /* Incremented once per CPU in ktls_init(). */
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 unsigned int ktls_ifnet_max_rexmit_pct = 2;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
     &ktls_ifnet_max_rexmit_pct, 2,
     "Max percent bytes retransmitted before ifnet TLS is disabled");
 
 /* Master switch; ktls_enable_rx() returns ENOTSUP while this is false. */
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 /* Controls creation of ktls_buffer_zone and the allocation threads. */
 static bool ktls_sw_buffer_cache = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
 static int ktls_max_alloc = 128;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
     &ktls_max_alloc, 128,
     "Max number of 16k buffers to allocate in thread context");
 
 /*
  * Global read-only statistics counters, exported under kern.ipc.tls
  * and kern.ipc.tls.stats.
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD,
     &ktls_cnt_tx_pending,
     "Number of TLS 1.0 records waiting for earlier TLS records");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
     "Number of TLS records in queue to tasks for SW encryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
     &ktls_cnt_rx_queued,
     "Number of TLS sockets in queue to tasks for SW decryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 /* Bumped on each ktls_enable_rx() call that passes the initial checks. */
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 /* Incremented when a session is allocated, decremented in ktls_cleanup(). */
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
     &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
 
 /* Per-mode sysctl nodes: kern.ipc.tls.sw, .ifnet and (optionally) .toe. */
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 /*
  * Per-cipher active session counts for software TLS.  Incremented in
  * ktls_try_sw() and decremented in ktls_cleanup().
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_sw_chacha20,
     "Active number of software TLS sessions using Chacha20-Poly1305");
 
 /*
  * Per-cipher active session counts for ifnet TLS.  Incremented in
  * ktls_try_ifnet() and decremented in ktls_cleanup().
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_ifnet_chacha20,
     "Active number of ifnet TLS sessions using Chacha20-Poly1305");
 
 /* Send tag reset statistics for ifnet TLS. */
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 /*
  * Administrative switch consulted by ktls_alloc_snd_tag(): when 0,
  * non-forced requests for ifnet TLS are denied with ENXIO.
  *
  * NOTE(review): unlike the other tunables in this file, the variable has
  * no initializer matching the macro's value argument (1), and it is an
  * int exported via SYSCTL_UINT -- confirm the intended default and type.
  */
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 #ifdef TCP_OFFLOAD
 /*
  * Per-cipher active session counts for TOE TLS.  Incremented in
  * ktls_try_toe() and decremented in ktls_cleanup().
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_toe_chacha20,
     "Active number of TOE TLS sessions using Chacha20-Poly1305");
 #endif
 
 /* malloc(9) type used for the work queue array and copied key material. */
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 /* Forward declarations for functions referenced before their definition. */
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 /* Task handler installed on each session's reset_tag_task. */
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 /* Entry points of the kthreads started by ktls_init(). */
 static void ktls_work_thread(void *ctx);
 static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 /*
  * Choose the CPU (and therefore the work queue) that will service a
  * socket's TLS work.
  *
  * With RSS, the RSS mapping of the connection's flow is used when it
  * yields a CPU.  Otherwise the flowid selects a CPU, either from the
  * connection's NUMA domain (when threads are bound to domains and the
  * domain is known) or from the global list of KTLS threads.
  */
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *pcb;
 #ifdef NUMA
 	struct ktls_domain_info *dom;
 #endif
 #ifdef RSS
 	u_int cpuid;
 #endif
 
 	pcb = sotoinpcb(so);
 #ifdef RSS
 	cpuid = rss_hash2cpuid(pcb->inp_flowid, pcb->inp_flowtype);
 	if (cpuid != NETISR_CPUID_NONE)
 		return (cpuid);
 #endif
 	/*
 	 * Shard connections by flowid so the choice is repeatable.
 	 * TLS 1.0 sessions depend on this: a given connection must
 	 * always land on the same queue to keep its records serialized.
 	 */
 #ifdef NUMA
 	if (ktls_bind_threads > 1 && pcb->inp_numa_domain != M_NODOM) {
 		dom = &ktls_domains[pcb->inp_numa_domain];
 		return (dom->cpu[pcb->inp_flowid % dom->count]);
 	}
 #endif
 	return (ktls_cpuid_lookup[pcb->inp_flowid % ktls_number_threads]);
 }
 #endif
 
 /*
  * Import callback for the "ktls_buffers" cache zone (registered in
  * ktls_init()).
  *
  * Tries to allocate 'count' buffers of ktls_maxlen bytes each from the
  * given domain using the contiguous page allocator, with the pages wired
  * and excluded from dumps.  The direct-map address of each buffer is
  * written to store[].  Allocation stops at the first failure.
  *
  * Returns the number of buffers actually stored, which may be less than
  * 'count'.
  */
 static int
 ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
 {
 	vm_page_t m;
-	int i;
+	int i, req;
 
 	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
 	    ("%s: ktls max length %d is not page size-aligned",
 	    __func__, ktls_maxlen));
 
+	req = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags);
 	for (i = 0; i < count; i++) {
-		m = vm_page_alloc_contig_domain(NULL, 0, domain,
-		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
-		    VM_ALLOC_NODUMP | malloc2vm_flags(flags),
+		m = vm_page_alloc_noobj_contig_domain(domain, req,
 		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
 		    VM_MEMATTR_DEFAULT);
 		if (m == NULL)
 			break;
 		/* Hand back the direct-map address of the first page. */
 		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	}
 	return (i);
 }
 
 /*
  * Release callback for the "ktls_buffers" cache zone: for each of the
  * 'count' buffers in store[], unwire and free every page that makes up
  * the buffer.
  */
 static void
 ktls_buffer_release(void *arg __unused, void **store, int count)
 {
 	vm_page_t first, pg;
 	int buf, off;
 
 	for (buf = 0; buf < count; buf++) {
 		/* Map the direct-map address back to its first page. */
 		first = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[buf]));
 		for (off = 0; off < atop(ktls_maxlen); off++) {
 			pg = first + off;
 			(void)vm_page_unwire_noq(pg);
 			vm_page_free(pg);
 		}
 	}
 }
 
 /*
  * Return the buffer backing an unmapped (M_EXTPG) mbuf to the KTLS
  * buffer zone.  The buffer is identified by the mbuf's first physical
  * page address.
  */
 static void
 ktls_free_mext_contig(struct mbuf *m)
 {
 	void *buf;
 
 	M_ASSERTEXTPG(m);
 	buf = (void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
 	uma_zfree(ktls_buffer_zone, buf);
 }
 
 /*
  * One-time KTLS initialization, called from ktls_start_kthreads().
  *
  * Allocates the per-CPU work queue array, creates the session zone and
  * (if ktls_sw_buffer_cache is set) the output buffer cache zone, sets up
  * one work queue and one worker kthread per CPU, and starts one buffer
  * allocation kthread per non-empty memory domain.
  *
  * Returns 0 on success or the error from kproc_kthread_add().
  *
  * NOTE(review): the early error returns leave the work queue array, the
  * zones and any threads that were already started in place; confirm that
  * this is acceptable given that the caller records the failure
  * permanently.
  */
 static int
 ktls_init(void)
 {
 	struct thread *td;
 	struct pcpu *pc;
 	int count, domain, error, i;
 
 	/* Indexed by CPU id, so size for the largest possible id. */
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	if (ktls_sw_buffer_cache) {
 		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
 		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
 		    ktls_buffer_import, ktls_buffer_release, NULL,
 		    UMA_ZONE_FIRSTTOUCH);
 	}
 
 	/*
 	 * Initialize the workqueues to run the TLS work.  We create a
 	 * work queue for each CPU.
 	 */
 	CPU_FOREACH(i) {
 		STAILQ_INIT(&ktls_wq[i].m_head);
 		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
 		if (ktls_bind_threads > 1) {
 			/* Append this CPU to its domain's CPU list. */
 			pc = pcpu_find(i);
 			domain = pc->pc_domain;
 			count = ktls_domains[domain].count;
 			ktls_domains[domain].cpu[count] = i;
 			ktls_domains[domain].count++;
 		}
 		/* Record the thread-index -> CPU mapping used by ktls_get_cpu(). */
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
 
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all KTLS threads.
 	 */
 	if (ktls_bind_threads > 1) {
 		for (i = 0; i < vm_ndomains; i++) {
 			if (ktls_domains[i].count == 0) {
 				ktls_bind_threads = 1;
 				break;
 			}
 		}
 	}
 
 	/* Start kthreads for each workqueue. */
 	CPU_FOREACH(i) {
 		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
 		if (error) {
 			printf("Can't add KTLS thread %d error %d\n", i, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Start an allocation thread per-domain to perform blocking allocations
 	 * of 16k physically contiguous TLS crypto destination buffers.
 	 */
 	if (ktls_sw_buffer_cache) {
 		for (domain = 0; domain < vm_ndomains; domain++) {
 			/* Skip domains without memory or without CPUs. */
 			if (VM_DOMAIN_EMPTY(domain))
 				continue;
 			if (CPU_EMPTY(&cpuset_domain[domain]))
 				continue;
 			error = kproc_kthread_add(ktls_alloc_thread,
 			    &ktls_domains[domain], &ktls_proc,
 			    &ktls_domains[domain].alloc_td.td,
 			    0, 0, "KTLS", "alloc_%d", domain);
 			if (error) {
 				printf("Can't add KTLS alloc thread %d error %d\n",
 				    domain, error);
 				return (error);
 			}
 		}
 	}
 
 	if (bootverbose)
 		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 	return (0);
 }
 
 /*
  * Make sure KTLS has been initialized, running ktls_init() on first use.
  *
  * ktls_init_state is 0 before initialization, positive once it has
  * succeeded and negative once it has failed.  Returns 0 if KTLS is
  * ready, ENXIO if an earlier attempt failed, or the error from
  * ktls_init() when this call performed the (failed) initialization.
  */
 static int
 ktls_start_kthreads(void)
 {
 	int error, state;
 
 	for (;;) {
 		state = atomic_load_acq_int(&ktls_init_state);
 		if (__predict_true(state > 0))
 			return (0);
 		if (state < 0)
 			return (ENXIO);
 
 		sx_xlock(&ktls_init_lock);
 		if (ktls_init_state == 0)
 			break;
 
 		/* Somebody else finished while we waited; re-evaluate. */
 		sx_xunlock(&ktls_init_lock);
 	}
 
 	/* The init lock is held and nobody has initialized yet. */
 	error = ktls_init();
 	state = (error == 0) ? 1 : -1;
 	atomic_store_rel_int(&ktls_init_state, state);
 	sx_xunlock(&ktls_init_lock);
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
 			en->iv_len != TLS_AEAD_GCM_LEN) ||
 		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
 			en->iv_len != TLS_1_3_GCM_IV_LEN))
 			return (EINVAL);
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			/*
 			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
 			 * all use explicit IVs.
 			 */
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 					return (EINVAL);
 				break;
 			}
 
 			/* FALLTHROUGH */
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 		if (en->tls_vminor != TLS_MINOR_VER_ZERO &&
 		    en->tls_vminor != TLS_MINOR_VER_ONE &&
 		    en->tls_vminor != TLS_MINOR_VER_TWO)
 			return (EINVAL);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
 			return (EINVAL);
 		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
 		    en->tls_vminor != TLS_MINOR_VER_THREE)
 			return (EINVAL);
 		if (en->iv_len != TLS_CHACHA20_IV_LEN)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = ktls_start_kthreads();
 	if (error != 0)
 		return (error);
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 				tls->sequential_records = true;
 				tls->next_seqno = be64dec(en->rec_seq);
 				STAILQ_INIT(&tls->pending_records);
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		/*
 		 * Chacha20 uses a 12 byte implicit IV.
 		 */
 		tls->params.tls_tlen = POLY1305_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	default:
 		panic("invalid cipher");
 	}
 
 	/*
 	 * TLS 1.3 includes optional padding which we do not support,
 	 * and also puts the "real" record type at the end of the
 	 * encrypted data.
 	 */
 	if (en->tls_vminor == TLS_MINOR_VER_THREE)
 		tls->params.tls_tlen += sizeof(uint8_t);
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for AEAD
 	 * ciphers and the initial implicit IV for TLS 1.0.  The
 	 * explicit portions of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
 		 * counter to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	ktls_cleanup(tls);
 	return (error);
 }
 
 /*
  * Create a new session that duplicates the parameters and work queue
  * assignment of an existing one.  The key material is deep-copied so
  * that the two sessions own independent buffers.  The new session
  * starts with a single reference and its own reset-tag task.
  */
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls)
 {
 	struct ktls_session *copy;
 
 	copy = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&copy->refcount, 1);
 	TASK_INIT(&copy->reset_tag_task, 0, ktls_reset_send_tag, copy);
 
 	/* Shallow copy first; the key pointers are replaced below. */
 	copy->params = tls->params;
 	copy->wq_index = tls->wq_index;
 
 	/* The authentication key is optional. */
 	if (tls->params.auth_key != NULL) {
 		copy->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(copy->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	copy->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
 	    M_WAITOK);
 	memcpy(copy->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (copy);
 }
 #endif
 
 /*
  * Release the resources owned by a session without freeing the session
  * structure itself.
  *
  * Drops the global active-session count, undoes the per-mode/per-cipher
  * accounting and mode-specific state (OCF state for software sessions,
  * the send tag reference for ifnet sessions), then frees the key buffers
  * with zfree() and clears the IV.  A session whose mode matches none of
  * the cases below only has its keys and IV released.
  */
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 
 	counter_u64_add(ktls_offload_active, -1);
 	switch (tls->mode) {
 	case TCP_TLS_MODE_SW:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_sw_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_sw_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_sw_chacha20, -1);
 			break;
 		}
 		ktls_ocf_free(tls);
 		break;
 	case TCP_TLS_MODE_IFNET:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_ifnet_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_ifnet_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_ifnet_chacha20, -1);
 			break;
 		}
 		/* Drop the send tag reference, if one is held. */
 		if (tls->snd_tag != NULL)
 			m_snd_tag_rele(tls->snd_tag);
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_toe_chacha20, -1);
 			break;
 		}
 		break;
 #endif
 	}
 	/* Reset pointers and lengths so a repeated call is harmless for keys. */
 	if (tls->params.auth_key != NULL) {
 		zfree(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key = NULL;
 		tls->params.auth_key_len = 0;
 	}
 	if (tls->params.cipher_key != NULL) {
 		zfree(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key = NULL;
 		tls->params.cipher_key_len = 0;
 	}
 	/* Do not leave IV material behind in the session structure. */
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 /*
  * Try to set up a session on a TCP offload engine.
  *
  * Returns ECONNRESET if the connection is gone, EOPNOTSUPP if the
  * connection is not offloaded, or the result of
  * tcp_offload_alloc_tls_session().  On success the session is switched
  * to TOE mode and the matching per-cipher counter is incremented.
  */
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *pcb;
 	struct tcpcb *tp;
 	int rc;
 
 	pcb = so->so_pcb;
 	INP_WLOCK(pcb);
 
 	/* Any sign of a torn-down connection is reported the same way. */
 	if ((pcb->inp_flags2 & INP_FREED) != 0 ||
 	    (pcb->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
 	    pcb->inp_socket == NULL) {
 		INP_WUNLOCK(pcb);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(pcb);
 	if ((tp->t_flags & TF_TOE) == 0) {
 		INP_WUNLOCK(pcb);
 		return (EOPNOTSUPP);
 	}
 
 	rc = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(pcb);
 	if (rc != 0)
 		return (rc);
 
 	tls->mode = TCP_TLS_MODE_TOE;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_toe_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_toe_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_toe_chacha20, 1);
 		break;
 	}
 	return (0);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  *
  * 'force' bypasses the ktls_ifnet_permitted administrative check.  On
  * success the new tag is returned via *mstp.
  *
  * Returns 0 on success; ECONNRESET if the connection is gone; ENXIO if
  * ifnet TLS is administratively denied or no cached route exists;
  * EOPNOTSUPP if the interface lacks the required capabilities; otherwise
  * the error from m_snd_tag_alloc().
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error;
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 *
 	 * - Always permit 'force' requests.
 	 * - ktls_ifnet_permitted == 0: always deny.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: Use the cached route in the inpcb to find the
 	 * interface.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	/* Hold the interface so it can be used after the inpcb is unlocked. */
 	if_ref(ifp);
 
 	/*
 	 * Allocate a TLS + ratelimit tag if the connection has an
 	 * existing pacing rate.
 	 */
 	if (tp->t_pacing_rate != -1 &&
 	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
 		params.tls_rate_limit.inp = inp;
 		params.tls_rate_limit.tls = tls;
 		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
 	} else {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS;
 		params.tls.inp = inp;
 		params.tls.tls = tls;
 	}
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	INP_RUNLOCK(inp);
 
 	/* The interface must support unmapped mbufs... */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	/* ...and TLS transmit for the connection's address family. */
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = m_snd_tag_alloc(ifp, &params, mstp);
 out:
 	/* Drop the reference taken with if_ref() above. */
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Try to set up a session for NIC (ifnet) TLS by allocating a send tag
  * on the connection's interface.  On success the session is switched to
  * ifnet mode, takes ownership of the tag, and the matching per-cipher
  * counter is incremented.  Returns the result of ktls_alloc_snd_tag().
  */
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
 {
 	struct m_snd_tag *tag;
 	int rc;
 
 	rc = ktls_alloc_snd_tag(so->so_pcb, tls, force, &tag);
 	if (rc != 0)
 		return (rc);
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = tag;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_ifnet_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_ifnet_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_ifnet_chacha20, 1);
 		break;
 	}
 	return (0);
 }
 
 /*
  * Try to set up a session for software TLS via ktls_ocf_try().  On
  * success the session is switched to software mode and the matching
  * per-cipher counter is incremented.  Returns 0 or the error from
  * ktls_ocf_try().
  */
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	int rc;
 
 	rc = ktls_ocf_try(so, tls, direction);
 	if (rc == 0) {
 		tls->mode = TCP_TLS_MODE_SW;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_sw_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_sw_gcm, 1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_sw_chacha20, 1);
 			break;
 		}
 	}
 	return (rc);
 }
 
 /*
  * KTLS RX stores data in the socket buffer as a list of TLS records,
  * where each record is stored as a control message containing the TLS
  * header followed by data mbufs containing the decrypted data.  This
  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
  * both encrypted and decrypted data.  TLS records decrypted by a NIC
  * should be queued to the socket buffer as records, but encrypted
  * data which needs to be decrypted by software arrives as a stream of
  * regular mbufs which need to be converted.  In addition, there may
  * already be pending encrypted data in the socket buffer when KTLS RX
  * is enabled.
  *
  * To manage not-yet-decrypted data for KTLS RX, the following scheme
  * is used:
  *
  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
  *
  * - ktls_check_rx checks this chain of mbufs reading the TLS header
  *   from the first mbuf.  Once all of the data for that TLS record is
  *   queued, the socket is queued to a worker thread.
  *
  * - The worker thread calls ktls_decrypt to decrypt TLS records in
  *   the TLS chain.  Each TLS record is detached from the TLS chain,
  *   decrypted, and inserted into the regular socket buffer chain as
  *   record starting with a control message holding the TLS header and
  *   a chain of mbufs holding the encrypted data.
  */
 
 /*
  * Move everything currently queued in a socket buffer onto its TLS
  * chain (sb_mtls) and mark it as not ready.
  *
  * The regular chain is emptied, every mbuf gets M_NOTREADY, and its
  * length is moved from the available count (sb_acc) to the TLS count
  * (sb_tlscc).  Afterwards sb_acc must be zero and sb_tlscc must equal
  * sb_ccc.
  */
 static void
 sb_mark_notready(struct sockbuf *sb)
 {
 	struct mbuf *m;
 
 	/* Detach the whole chain from the regular socket buffer. */
 	m = sb->sb_mb;
 	sb->sb_mtls = m;
 	sb->sb_mb = NULL;
 	sb->sb_mbtail = NULL;
 	sb->sb_lastrecord = NULL;
 
 	while (m != NULL) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
 		KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
 		    __func__));
 		KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
 		m->m_flags |= M_NOTREADY;
 		sb->sb_acc -= m->m_len;
 		sb->sb_tlscc += m->m_len;
 		/* Ends up pointing at the last mbuf of the chain. */
 		sb->sb_mtlstail = m;
 		m = m->m_next;
 	}
 
 	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 	    sb->sb_ccc));
 }
 
 /*
  * Enable KTLS on the receive side of a TCP socket.
  *
  * Returns 0 on success.  Fails with ENOTSUP if KTLS offload is
  * disabled, if AES-CBC is requested while CBC is disabled, or if
  * TLS 1.3 is requested; EINVAL for listening or non-TCP sockets;
  * EALREADY if a receive session already exists; otherwise the error
  * from ktls_create_session() or from the TOE/software setup helpers.
  */
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS 1.3 is not yet supported. */
 	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
 	    en->tls_vminor == TLS_MINOR_VER_THREE)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* Prefer TOE, falling back to software decryption. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 	if (error)
 #endif
 		error = ktls_try_sw(so, tls, KTLS_RX);
 
 	if (error) {
 		/*
 		 * Drop the session's only reference rather than just
 		 * calling ktls_cleanup(): ktls_destroy() also returns
 		 * the session structure to its zone, so cleanup alone
 		 * would leak it.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
 	so->so_rcv.sb_flags |= SB_TLS_RX;
 
 	/* Mark existing data as not ready until it can be decrypted. */
 	if (tls->mode != TCP_TLS_MODE_TOE) {
 		sb_mark_notready(&so->so_rcv);
 		ktls_check_rx(&so->so_rcv);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Enable KTLS on the transmit side of a TCP socket.
  *
  * Returns 0 on success.  Fails with ENOTSUP if KTLS offload is
  * disabled or AES-CBC is requested while CBC is disabled; EINVAL for
  * listening or non-TCP sockets; EALREADY if a transmit session
  * already exists; ENXIO if unmapped (ext pgs) mbufs are disabled;
  * otherwise the error from ktls_create_session(), from the
  * TOE/ifnet/software setup helpers, or from SOCK_IO_SEND_LOCK().
  */
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, false);
 	if (error)
 		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	if (error) {
 		/*
 		 * Drop the session's only reference rather than just
 		 * calling ktls_cleanup(): ktls_destroy() also returns
 		 * the session structure to its zone, so cleanup alone
 		 * would leak it.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		/* As above: release the session, not just its contents. */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/*
 	 * Write lock the INP when setting sb_tls_info so that
 	 * routines in tcp_ratelimit.c can read sb_tls_info while
 	 * holding the INP lock.
 	 */
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the receive side of a socket via *modep.
  *
  * Returns EINVAL for listening sockets and 0 otherwise.  The INP
  * write lock must be held by the caller (asserted).  When no receive
  * session is attached, TCP_TLS_MODE_NONE is reported.
  */
 int
 ktls_get_rx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCK_RECVBUF_LOCK(so);
 	session = so->so_rcv.sb_tls_info;
 	*modep = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCK_RECVBUF_UNLOCK(so);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the send side of a socket via *modep.
  *
  * Returns EINVAL for listening sockets and 0 otherwise.  The INP
  * write lock must be held by the caller (asserted).  When no send
  * session is attached, TCP_TLS_MODE_NONE is reported.
  */
 int
 ktls_get_tx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCK_SENDBUF_LOCK(so);
 	session = so->so_snd.sb_tls_info;
 	*modep = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCK_SENDBUF_UNLOCK(so);
 
 	return (0);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  *
  * 'mode' must be TCP_TLS_MODE_SW or TCP_TLS_MODE_IFNET; anything
  * else, or a listening socket, yields EINVAL.  If the socket has no
  * transmit session or is already in the requested mode, 0 is
  * returned without changes.
  *
  * The caller must hold the INP write lock (asserted).  The lock is
  * dropped while the replacement session is set up and is held again
  * on every return path.
  *
  * Returns 0 on success, EBUSY if the session changed while the lock
  * was dropped, or the error from the setup helper or from
  * SOCK_IO_SEND_LOCK().
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	/* Keep the old session alive while the locks are dropped. */
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	tls_new = ktls_clone_session(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, true);
 	else
 		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		SOCK_IO_SEND_UNLOCK(so);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	/*
 	 * Write lock the INP when replacing sb_tls_info, as
 	 * ktls_enable_tx() does, so that code which reads
 	 * sb_tls_info while holding only the INP lock never sees the
 	 * pointer change (and the old session released) underneath
 	 * it.  The lock order (send I/O lock, INP, socket buffer)
 	 * matches ktls_enable_tx().  The INP lock stays held for the
 	 * return to the caller.
 	 */
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	/* 'context' is the TLS session; its inp field names the connection. */
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	/* The replacement tag is allocated with the INP lock dropped. */
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		/* reset_pending is updated under the pool mutex keyed on 'tls'. */
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		/*
 		 * Release an INP reference (presumably the in_pcbref()
 		 * taken in ktls_output_eagain() -- confirm).  The lock
 		 * is only dropped here when in_pcbrele_wlocked()
 		 * returns false.
 		 */
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
 			/*
 			 * Drop the connection unless it is already in
 			 * TIMEWAIT or dropped.  The INP is unlocked
 			 * here only if tcp_drop() returns a non-NULL
 			 * tcpcb.
 			 */
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	/*
 	 * Drop a session reference (presumably the ktls_hold() taken
 	 * when the task was scheduled in ktls_output_eagain() --
 	 * confirm).
 	 */
 	ktls_free(tls);
 }
 
 /*
  * Schedule a send tag reset for a TLS session, unless one is already
  * pending or no inp was supplied.  Always returns ENOBUFS.
  *
  * When a reset is scheduled, a reference is taken on both the session
  * and the inp before the task is enqueued.
  */
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 
 		/*
 		 * reset_pending is tested and set under the pool mutex
 		 * keyed on the session, so at most one task is queued
 		 * while it remains true.
 		 */
 		mtx_pool_lock(mtxpool_sleep, tls);
 		if (!tls->reset_pending) {
 			(void) ktls_hold(tls);
 			in_pcbref(inp);
 			tls->inp = inp;
 			tls->reset_pending = true;
 			taskqueue_enqueue(taskqueue_thread,
 			    &tls->reset_tag_task);
 		}
 		mtx_pool_unlock(mtxpool_sleep, tls);
 	}
 
 	return (ENOBUFS);
 }
 
 #ifdef RATELIMIT
 /*
  * Ask the driver to change the pacing rate of an ifnet TLS session's
  * send tag.
  *
  * Returns 0 without doing anything if the session currently has no
  * send tag; otherwise returns the result of the tag's
  * snd_tag_modify method.
  */
 int
 ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
 
 	/* Can't get to the inp, but it should be locked. */
 	/* INP_LOCK_ASSERT(inp); */
 
 	MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 
 	mst = tls->snd_tag;
 	if (mst == NULL) {
 		/*
 		 * A send tag reset is in progress, so this change is
 		 * ignored.  The pending reset may or may not pick up
 		 * the updated rate from the tcpcb; if it does not,
 		 * this rate change is simply lost.
 		 */
 		return (0);
 	}
 
 	MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
 
 	return (mst->sw->snd_tag_modify(mst, &params));
 }
 #endif
 #endif
 
 /*
  * Tear down a TLS session: free any records still parked on the
  * session's pending_records list, release the session's resources
  * via ktls_cleanup(), and return the session structure to
  * ktls_session_zone.
  */
 void
 ktls_destroy(struct ktls_session *tls)
 {
 
 	if (tls->sequential_records) {
 		struct mbuf *m, *n;
 		int page_count;
 
 		/*
 		 * Each list entry heads a batch of records covering
 		 * m_epg_enc_cnt pages.  Free mbufs one at a time,
 		 * following the chain returned by m_free(), until that
 		 * page count is consumed.  The _SAFE iterator saves
 		 * the next list entry in 'n' before 'm' is freed.
 		 */
 		STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) {
 			page_count = m->m_epg_enc_cnt;
 			while (page_count > 0) {
 				KASSERT(page_count >= m->m_epg_nrdy,
 				    ("%s: too few pages", __func__));
 				page_count -= m->m_epg_nrdy;
 				m = m_free(m);
 			}
 		}
 	}
 	ktls_cleanup(tls);
 	uma_zfree(ktls_session_zone, tls);
 }
 
 /*
  * Assign TLS sequence numbers to a chain of unmapped mbufs.  Each
  * mbuf in the chain takes the socket buffer's current sb_tls_seqno,
  * which is then advanced by one.
  */
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 	struct mbuf *rec;
 
 	rec = m;
 	while (rec != NULL) {
 		KASSERT((rec->m_flags & M_EXTPG) != 0,
 		    ("ktls_seq: mapped mbuf %p", rec));
 
 		rec->m_epg_seqno = sb->sb_tls_seqno++;
 		rec = rec->m_next;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_cnt argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  To handle the
  * special case of empty fragments for TLS 1.0 sessions, an empty
  * fragment counts as one page.  For sessions not in SW mode it is
  * left at zero.
  *
  * Each framed mbuf takes its own reference on the session.
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be TLS records whose
 		 * payload does not exceed the maximum frame length.
 		 *
 		 * Empty TLS records are permitted when using CBC.
 		 */
 		KASSERT(m->m_len <= maxlen &&
 		    (tls->params.cipher_algorithm == CRYPTO_AES_CBC ?
 		    m->m_len >= 0 : m->m_len > 0),
 		    ("ktls_frame: m %p len %d\n", m, m->m_len));
 
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));
 
 		/* Payload length, captured before framing is added to m_len. */
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
 		m->m_epg_tls = ktls_hold(tls);
 
 		m->m_epg_hdrlen = tls->params.tls_hlen;
 		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 */
 			bs = tls->params.tls_bs;
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 			m->m_epg_trllen -= delta;
 		}
 		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquerades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
 			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		/* The length field covers everything after the record header. */
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
 			if (__predict_false(tls_len == 0)) {
 				/* TLS 1.0 empty fragment. */
 				m->m_epg_nrdy = 1;
 			} else
 				m->m_epg_nrdy = m->m_epg_npgs;
 			*enq_cnt += m->m_epg_nrdy;
 		}
 	}
 }
 
 /*
  * Check whether the TLS chain of a receive socket buffer holds a
  * complete TLS record and, if so, queue the socket to its session's
  * work queue for decryption.
  *
  * The socket buffer lock must be held.  Nothing is done while
  * SB_TLS_RX_RUNNING is set.  If the socket can no longer receive and
  * only a partial record is queued, so_error is set to EMSGSIZE.
  */
 void
 ktls_check_rx(struct sockbuf *sb)
 {
 	struct tls_record_layer hdr;
 	struct ktls_wq *queue;
 	struct socket *so;
 	bool eof, worker_active;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	/* The socket is already queued or being processed. */
 	if ((sb->sb_flags & SB_TLS_RX_RUNNING) != 0)
 		return;
 
 	eof = (sb->sb_state & SBS_CANTRCVMORE) != 0;
 
 	/* A full TLS header is needed to learn the record length. */
 	if (sb->sb_tlscc < sizeof(hdr)) {
 		if (eof && sb->sb_tlscc != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 
 	/* Wait until the whole record (header plus body) is queued. */
 	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 		if (eof)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	sb->sb_flags |= SB_TLS_RX_RUNNING;
 
 	/* Hold the socket while it sits on the work queue. */
 	soref(so);
 	queue = &ktls_wq[sb->sb_tls_info->wq_index];
 	mtx_lock(&queue->mtx);
 	STAILQ_INSERT_TAIL(&queue->so_head, so, so_ktls_rx_list);
 	worker_active = queue->running;
 	mtx_unlock(&queue->mtx);
 	if (!worker_active)
 		wakeup(queue);
 	counter_u64_add(ktls_cnt_rx_queued, 1);
 }
 
 /*
  * Detach the first 'len' bytes of the socket buffer's TLS chain
  * (sb_mtls) as a separate mbuf chain and return it.
  *
  * The socket buffer lock must be held and 'len' must not exceed
  * sb_tlscc.  If the record boundary falls inside an mbuf, that mbuf is
  * split and the remainder becomes the new head of the TLS chain.
  * The split may need a sleeping allocation, for which the lock is
  * dropped; if sb_mtls changed meanwhile, NULL is returned so the
  * caller can retry.
  *
  * On success the detached mbufs are removed from the TLS accounting
  * via sbfree_ktls_rx(), sb_tlsdcc is set to 'len' and 'len' is added
  * to sb_ccc.
  */
 static struct mbuf *
 ktls_detach_record(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *n, *top;
 	int remain;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(len <= sb->sb_tlscc);
 
 	/*
 	 * If TLS chain is the exact size of the record,
 	 * just grab the whole record.
 	 */
 	top = sb->sb_mtls;
 	if (sb->sb_tlscc == len) {
 		sb->sb_mtls = NULL;
 		sb->sb_mtlstail = NULL;
 		goto out;
 	}
 
 	/*
 	 * While it would be nice to use m_split() here, we need
 	 * to know exactly what m_split() allocates to update the
 	 * accounting, so do it inline instead.
 	 */
 	/* Find the mbuf 'm' holding the last byte of the record. */
 	remain = len;
 	for (m = top; remain > m->m_len; m = m->m_next)
 		remain -= m->m_len;
 
 	/* Easy case: don't have to split 'm'. */
 	if (remain == m->m_len) {
 		sb->sb_mtls = m->m_next;
 		if (sb->sb_mtls == NULL)
 			sb->sb_mtlstail = NULL;
 		m->m_next = NULL;
 		goto out;
 	}
 
 	/*
 	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 	 * with M_NOWAIT first.
 	 */
 	n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL) {
 		/*
 		 * Use M_WAITOK with socket buffer unlocked.  If
 		 * 'sb_mtls' changes while the lock is dropped, return
 		 * NULL to force the caller to retry.
 		 */
 		SOCKBUF_UNLOCK(sb);
 
 		n = m_get(M_WAITOK, MT_DATA);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_mtls != top) {
 			m_free(n);
 			return (NULL);
 		}
 	}
 	n->m_flags |= M_NOTREADY;
 
 	/* Store remainder in 'n'. */
 	n->m_len = m->m_len - remain;
 	if (m->m_flags & M_EXT) {
 		/* Share the external storage of 'm' instead of copying. */
 		n->m_data = m->m_data + remain;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 	}
 
 	/* Trim 'm' and update accounting. */
 	m->m_len -= n->m_len;
 	sb->sb_tlscc -= n->m_len;
 	sb->sb_ccc -= n->m_len;
 
 	/* Account for 'n'. */
 	sballoc_ktls_rx(sb, n);
 
 	/* Insert 'n' into the TLS chain. */
 	sb->sb_mtls = n;
 	n->m_next = m->m_next;
 	if (sb->sb_mtlstail == m)
 		sb->sb_mtlstail = n;
 
 	/* Detach the record from the TLS chain. */
 	m->m_next = NULL;
 
 out:
 	MPASS(m_length(top, NULL) == len);
 	/* Remove the detached mbufs from the TLS chain accounting. */
 	for (m = top; m != NULL; m = m->m_next)
 		sbfree_ktls_rx(sb, m);
 	/* Track the detached bytes in sb_tlsdcc while they are in flight. */
 	sb->sb_tlsdcc = len;
 	sb->sb_ccc += len;
 	SBCHECK(sb);
 	return (top);
 }
 
 /*
  * Decrypt the complete TLS records queued on a socket's receive TLS
  * chain.
  *
  * For each complete record the header is validated, the record is
  * detached from the TLS chain, decrypted with the socket buffer lock
  * dropped, and appended to the socket buffer as a control message
  * followed by the payload mbufs.  The loop stops when no complete
  * record remains or when the detached data was discarded while the
  * lock was dropped (sb_tlsdcc == 0).
  *
  * SB_TLS_RX_RUNNING must be set on entry.  It is cleared on the
  * normal exit path; the corrupted-record path aborts the connection
  * and jumps to 'deref' without clearing it.  A socket reference is
  * released via sorele() on every path (presumably the one taken by
  * ktls_check_rx() -- confirm).
  */
 static void
 ktls_decrypt(struct socket *so)
 {
 	char tls_header[MBUF_PEXT_HDR_LEN];
 	struct ktls_session *tls;
 	struct sockbuf *sb;
 	struct tls_record_layer *hdr;
 	struct tls_get_record tgr;
 	struct mbuf *control, *data, *m;
 	uint64_t seqno;
 	int error, remain, tls_len, trail_len;
 
 	hdr = (struct tls_record_layer *)tls_header;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 	    ("%s: socket %p not running", __func__, so));
 
 	tls = sb->sb_tls_info;
 	MPASS(tls != NULL);
 
 	for (;;) {
 		/* Is there enough queued for a TLS header? */
 		if (sb->sb_tlscc < tls->params.tls_hlen)
 			break;
 
 		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 		/* Total on-the-wire record size: fixed header plus body. */
 		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 
 		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 		    hdr->tls_vminor != tls->params.tls_vminor)
 			error = EINVAL;
 		else if (tls_len < tls->params.tls_hlen || tls_len >
 		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 		    tls->params.tls_tlen)
 			error = EMSGSIZE;
 		else
 			error = 0;
 		if (__predict_false(error != 0)) {
 			/*
 			 * We have a corrupted record and are likely
 			 * out of sync.  The connection isn't
 			 * recoverable at this point, so abort it.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			counter_u64_add(ktls_offload_corrupted_records, 1);
 
 			CURVNET_SET(so->so_vnet);
 			so->so_proto->pr_usrreqs->pru_abort(so);
 			so->so_error = error;
 			CURVNET_RESTORE();
 			goto deref;
 		}
 
 		/* Is the entire record queued? */
 		if (sb->sb_tlscc < tls_len)
 			break;
 
 		/*
 		 * Split out the portion of the mbuf chain containing
 		 * this TLS record.
 		 */
 		data = ktls_detach_record(sb, tls_len);
 		/* NULL means the chain changed during the split; re-evaluate. */
 		if (data == NULL)
 			continue;
 		MPASS(sb->sb_tlsdcc == tls_len);
 
 		seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		SBCHECK(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		/* Decrypt with the socket buffer unlocked. */
 		error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			SOCKBUF_LOCK(sb);
 			if (sb->sb_tlsdcc == 0) {
 				/*
 				 * sbcut/drop/flush discarded these
 				 * mbufs.
 				 */
 				m_freem(data);
 				break;
 			}
 
 			/*
 			 * Drop this TLS record's data, but keep
 			 * decrypting subsequent records.
 			 */
 			sb->sb_ccc -= tls_len;
 			sb->sb_tlsdcc = 0;
 
 			/*
 			 * The lock is re-taken below after
 			 * sorwakeup_locked(), which presumably releases
 			 * it -- confirm.
 			 */
 			CURVNET_SET(so->so_vnet);
 			so->so_error = EBADMSG;
 			sorwakeup_locked(so);
 			CURVNET_RESTORE();
 
 			m_freem(data);
 
 			SOCKBUF_LOCK(sb);
 			continue;
 		}
 
 		/* Allocate the control mbuf. */
 		tgr.tls_type = hdr->tls_type;
 		tgr.tls_vmajor = hdr->tls_vmajor;
 		tgr.tls_vminor = hdr->tls_vminor;
 		/* Payload length excludes the TLS header and the trailer. */
 		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 		    trail_len);
 		control = sbcreatecontrol_how(&tgr, sizeof(tgr),
 		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_tlsdcc == 0) {
 			/* sbcut/drop/flush discarded these mbufs. */
 			MPASS(sb->sb_tlscc == 0);
 			m_freem(data);
 			m_freem(control);
 			break;
 		}
 
 		/*
 		 * Clear the 'dcc' accounting in preparation for
 		 * adding the decrypted record.
 		 */
 		sb->sb_ccc -= tls_len;
 		sb->sb_tlsdcc = 0;
 		SBCHECK(sb);
 
 		/* If there is no payload, drop all of the data. */
 		if (tgr.tls_length == htobe16(0)) {
 			m_freem(data);
 			data = NULL;
 		} else {
 			/* Trim header. */
 			remain = tls->params.tls_hlen;
 			while (remain > 0) {
 				if (data->m_len > remain) {
 					data->m_data += remain;
 					data->m_len -= remain;
 					break;
 				}
 				remain -= data->m_len;
 				data = m_free(data);
 			}
 
 			/* Trim trailer and clear M_NOTREADY. */
 			remain = be16toh(tgr.tls_length);
 			m = data;
 			for (m = data; remain > m->m_len; m = m->m_next) {
 				m->m_flags &= ~M_NOTREADY;
 				remain -= m->m_len;
 			}
 			/* 'm' holds the last payload byte; cut everything after it. */
 			m->m_len = remain;
 			m_freem(m->m_next);
 			m->m_next = NULL;
 			m->m_flags &= ~M_NOTREADY;
 
 			/* Set EOR on the final mbuf. */
 			m->m_flags |= M_EOR;
 		}
 
 		sbappendcontrol_locked(sb, data, control, 0);
 	}
 
 	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 
 	/* A partial record left behind after EOF can never complete. */
 	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 		so->so_error = EMSGSIZE;
 
 	sorwakeup_locked(so);
 
 deref:
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	CURVNET_SET(so->so_vnet);
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Hand an mbuf to its session's work queue for freeing.  The mbuf is
  * flagged EPG_FLAG_2FREE, appended to the queue's mbuf list, and the
  * queue is woken up if its 'running' flag was clear.
  */
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *queue;
 	bool worker_active;
 
 	/* Mark it for freeing. */
 	m->m_epg_flags |= EPG_FLAG_2FREE;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 	mtx_lock(&queue->mtx);
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	worker_active = queue->running;
 	mtx_unlock(&queue->mtx);
 
 	if (!worker_active)
 		wakeup(queue);
 }
 
 static void *
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
 	int domain, running;
 
 	if (m->m_epg_npgs <= 2)
 		return (NULL);
 	if (ktls_buffer_zone == NULL)
 		return (NULL);
 	if ((u_int)(ticks - wq->lastallocfail) < hz) {
 		/*
 		 * Rate-limit allocation attempts after a failure.
 		 * ktls_buffer_import() will acquire a per-domain mutex to check
 		 * the free page queues and may fail consistently if memory is
 		 * fragmented.
 		 */
 		return (NULL);
 	}
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
 	if (buf == NULL) {
 		domain = PCPU_GET(domain);
 		wq->lastallocfail = ticks;
 
 		/*
 		 * Note that this check is "racy", but the races are
 		 * harmless, and are either a spurious wakeup if
 		 * multiple threads fail allocations before the alloc
 		 * thread wakes, or waiting an extra second in case we
 		 * see an old value of running == true.
 		 */
 		if (!VM_DOMAIN_EMPTY(domain)) {
 			running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
 			if (!running)
 				wakeup(&ktls_domains[domain].alloc_td);
 		}
 	}
 	return (buf);
 }
 
 /*
  * Encrypt a single TLS record held in the unmapped, not-ready mbuf
  * 'm' using the session's sw_encrypt method, and return its result.
  *
  * Anonymous mbufs are encrypted in place.  For other mbufs a
  * destination is set up in 'state': either one contiguous buffer from
  * ktls_buffer_alloc() (state->cbuf) or individually allocated wired
  * pages, with the physical addresses recorded in state->parray.  If
  * encryption fails, that destination memory is freed again here.
  */
 static int
 ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m,
     struct ktls_session *tls, struct ktls_ocf_encrypt_state *state)
 {
 	vm_page_t pg;
 	int error, i, len, off;
 
 	KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY),
 	    ("%p not unready & nomap mbuf\n", m));
 	KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
 	    ("page count %d larger than maximum frame length %d", m->m_epg_npgs,
 	    ktls_maxlen));
 
 	/* Anonymous mbufs are encrypted in place. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) != 0)
 		return (tls->sw_encrypt(state, tls, m, NULL, 0));
 
 	/*
 	 * For file-backed mbufs (from sendfile), anonymous wired
 	 * pages are allocated and used as the encryption destination.
 	 */
 	if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
 		/*
 		 * Contiguous buffer: a single iovec covers the whole
 		 * payload and only parray[0] is filled in.
 		 */
 		len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_base = (char *)state->cbuf +
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_len = len;
 		state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf);
 		i = 1;
 	} else {
 		/* One page and one iovec per source page; only the first has an offset. */
 		off = m->m_epg_1st_off;
 		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
 			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 			len = m_epg_pagelen(m, i, off);
 			state->parray[i] = VM_PAGE_TO_PHYS(pg);
 			state->dst_iov[i].iov_base =
 			    (char *)PHYS_TO_DMAP(state->parray[i]) + off;
 			state->dst_iov[i].iov_len = len;
 		}
 	}
 	/* The final iovec points at the mbuf's own trailer storage. */
 	KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small"));
 	state->dst_iov[i].iov_base = m->m_epg_trail;
 	state->dst_iov[i].iov_len = m->m_epg_trllen;
 
 	error = tls->sw_encrypt(state, tls, m, state->dst_iov, i + 1);
 
 	if (__predict_false(error != 0)) {
 		/* Free the anonymous pages. */
 		if (state->cbuf != NULL)
 			uma_zfree(ktls_buffer_zone, state->cbuf);
 		else {
 			for (i = 0; i < m->m_epg_npgs; i++) {
 				pg = PHYS_TO_VM_PAGE(state->parray[i]);
 				(void)vm_page_unwire_noq(pg);
 				vm_page_free(pg);
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * Number of TLS records in a batch passed to ktls_enqueue().
  *
  * 'm' heads the batch and its m_epg_enc_cnt holds the batch's total
  * page count.  Records are counted by walking m_next and subtracting
  * each record's m_epg_nrdy until the page count is used up.
  */
 static u_int
 ktls_batched_records(struct mbuf *m)
 {
 	int count, pages_left;
 
 	count = 0;
 	for (pages_left = m->m_epg_enc_cnt; pages_left > 0; m = m->m_next) {
 		pages_left -= m->m_epg_nrdy;
 		count++;
 	}
 	KASSERT(pages_left == 0, ("%s: mismatched page count", __func__));
 	return (count);
 }
 
 /*
  * Queue a batch of not-yet-encrypted TLS records, headed by 'm', to
  * the work queue of its session for software encryption.
  *
  * 'page_count' (non-zero) is recorded in the head mbuf's
  * m_epg_enc_cnt and 'so' in its m_epg_so.  For sessions that require
  * sequential records, a batch that is not the next expected one is
  * parked on tls->pending_records instead and queued later in order.
  */
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_session *tls;
 	struct ktls_wq *wq;
 	int queued;
 	bool running;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 	    (M_EXTPG | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 
 	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
 	m->m_epg_enc_cnt = page_count;
 
 	/*
 	 * Save a pointer to the socket.  The caller is responsible
 	 * for taking an additional reference via soref().
 	 */
 	m->m_epg_so = so;
 
 	/* Number of batches moved onto the work queue by this call. */
 	queued = 1;
 	tls = m->m_epg_tls;
 	wq = &ktls_wq[tls->wq_index];
 	mtx_lock(&wq->mtx);
 	if (__predict_false(tls->sequential_records)) {
 		/*
 		 * For TLS 1.0, records must be encrypted
 		 * sequentially.  For a given connection, all records
 		 * queued to the associated work queue are processed
 		 * sequentially.  However, sendfile(2) might complete
 		 * I/O requests spanning multiple TLS records out of
 		 * order.  Here we ensure TLS records are enqueued to
 		 * the work queue in FIFO order.
 		 *
 		 * tls->next_seqno holds the sequence number of the
 		 * next TLS record that should be enqueued to the work
 		 * queue.  If this next record is not tls->next_seqno,
 		 * it must be a future record, so insert it, sorted by
 		 * TLS sequence number, into tls->pending_records and
 		 * return.
 		 *
 		 * If this TLS record matches tls->next_seqno, place
 		 * it in the work queue and then check
 		 * tls->pending_records to see if any
 		 * previously-queued records are now ready for
 		 * encryption.
 		 */
 		if (m->m_epg_seqno != tls->next_seqno) {
 			struct mbuf *n, *p;
 
 			/*
 			 * Find the first pending entry 'n' with a
 			 * larger sequence number; 'p' trails one
 			 * entry behind it.
 			 */
 			p = NULL;
 			STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) {
 				if (n->m_epg_seqno > m->m_epg_seqno)
 					break;
 				p = n;
 			}
 			if (n == NULL)
 				STAILQ_INSERT_TAIL(&tls->pending_records, m,
 				    m_epg_stailq);
 			else if (p == NULL)
 				STAILQ_INSERT_HEAD(&tls->pending_records, m,
 				    m_epg_stailq);
 			else
 				STAILQ_INSERT_AFTER(&tls->pending_records, p, m,
 				    m_epg_stailq);
 			mtx_unlock(&wq->mtx);
 			counter_u64_add(ktls_cnt_tx_pending, 1);
 			return;
 		}
 
 		tls->next_seqno += ktls_batched_records(m);
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 		/* Move over any parked batches that are now in sequence. */
 		while (!STAILQ_EMPTY(&tls->pending_records)) {
 			struct mbuf *n;
 
 			n = STAILQ_FIRST(&tls->pending_records);
 			if (n->m_epg_seqno != tls->next_seqno)
 				break;
 
 			queued++;
 			STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq);
 			tls->next_seqno += ktls_batched_records(n);
 			STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq);
 		}
 		/* Every batch beyond 'm' itself came off the pending list. */
 		counter_u64_add(ktls_cnt_tx_pending, -(queued - 1));
 	} else
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_tx_queued, queued);
 }
 
 /*
  * Finish encryption of a file-backed mbuf (from sendfile): release
  * the file's pages through the mbuf's current free routine and point
  * the mbuf at the anonymous pages that ktls_encrypt_record() set up
  * in 'state'.  The mbuf is marked EPG_FLAG_ANON afterwards.
  */
 static void
 ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state)
 {
 	int pgidx;
 
 	MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0);
 
 	/* Release the old pages before their addresses are overwritten. */
 	m->m_ext.ext_free(m);
 
 	if (state->cbuf == NULL) {
 		/* Individually allocated pages: one address per page. */
 		for (pgidx = 0; pgidx < m->m_epg_npgs; pgidx++)
 			m->m_epg_pa[pgidx] = state->parray[pgidx];
 
 		/* Use the basic free routine. */
 		m->m_ext.ext_free = mb_free_mext_pgs;
 	} else {
 		/* Contiguous buffer: derive each address from the first. */
 		for (pgidx = 0; pgidx < m->m_epg_npgs; pgidx++)
 			m->m_epg_pa[pgidx] = state->parray[0] + ptoa(pgidx);
 
 		/* Contig pages should go back to the cache. */
 		m->m_ext.ext_free = ktls_free_mext_contig;
 	}
 
 	/* Pages are now writable. */
 	m->m_epg_flags |= EPG_FLAG_ANON;
 }
 
 /*
  * Encrypt the batch of TLS records headed by 'top', which was queued
  * by ktls_enqueue().
  *
  * On success the protocol's pru_ready method is called for the
  * encrypted pages.  If any record fails to encrypt, the connection is
  * aborted, so_error is set to EIO and the batch is released with
  * mb_free_notready().  In both cases a socket reference is dropped
  * via sorele() (per the comment in ktls_enqueue(), the one the
  * enqueuing caller took with soref()).
  */
 static __noinline void
 ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int error, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		error = ktls_encrypt_record(wq, m, tls, &state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			break;
 		}
 
 		/* File-backed records switch over to the freshly encrypted pages. */
 		if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 			ktls_finish_nonanon(m, &state);
 
 		npages += m->m_epg_nrdy;
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
 		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
 		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
 	} else {
 		/* Abort the connection and release the whole batch. */
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(top, total_pages);
 	}
 
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Finish one record whose encryption state is 'state'; 'error' is the
  * outcome of the encryption (0 on success).
  *
  * The state is freed here, the record's session reference is dropped,
  * and the record is either passed to the protocol's pru_ready method
  * or, on error, the connection is aborted with so_error set to EIO.
  * sorele() is called on state->so before returning.
  *
  * NOTE(review): presumably invoked on completion of a record submitted
  * by ktls_encrypt_async(), which allocates such a state and takes a
  * socket reference per record -- confirm against the crypto backend.
  */
 void
 ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int npages;
 
 	m = state->m;
 
 	/* File-backed records must switch to the encrypted pages. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 		ktls_finish_nonanon(m, state);
 
 	/* Fetch the socket before the state goes away. */
 	so = state->so;
 	free(state, M_KTLS);
 
 	/*
 	 * The session is no longer needed by this record, so drop its
 	 * reference.  Existing code tells encrypted records from
 	 * yet-to-be-encrypted ones by whether a session is attached.
 	 */
 	tls = m->m_epg_tls;
 	m->m_epg_tls = NULL;
 	ktls_free(tls);
 
 	if (error != 0)
 		counter_u64_add(ktls_offload_failed_crypto, 1);
 
 	CURVNET_SET(so->so_vnet);
 	npages = m->m_epg_nrdy;
 
 	if (error != 0) {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, npages);
 	} else {
 		(void)so->so_proto->pr_usrreqs->pru_ready(so, m, npages);
 	}
 
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Similar to ktls_encrypt, but used with asynchronous OCF backends
  * (coprocessors) where encryption does not use host CPU resources and
  * it can be beneficial to queue more requests than CPUs.
  *
  * Each record gets its own heap-allocated encryption state and its own
  * socket reference.  When a record is submitted successfully neither is
  * released here; ktls_encrypt_cb() frees a state and calls sorele(), so
  * it is presumably the completion path for these records (confirm).
  * If submitting a record fails, the connection is aborted and the
  * pages that were not submitted are freed.
  */
 static __noinline void
 ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state *state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m, *n;
 	int error, mpages, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	error = 0;
 	for (m = top; npages != total_pages; m = n) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		/* Per-record state plus a socket reference to go with it. */
 		state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO);
 		soref(so);
 		state->so = so;
 		state->m = m;
 
 		/*
 		 * Capture the page count and the successor before the
 		 * record is submitted.
 		 * NOTE(review): presumably because the record may complete
 		 * (and 'm' be passed on) before ktls_encrypt_record()
 		 * returns -- confirm.
 		 */
 		mpages = m->m_epg_nrdy;
 		n = m->m_next;
 
 		error = ktls_encrypt_record(wq, m, tls, state);
 		if (error) {
 			/*
 			 * Submission failed: undo this iteration's state
 			 * allocation and socket reference.  The 'break'
 			 * skips 'm = n', so 'm' still names the failing
 			 * record below.
 			 */
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			free(state, M_KTLS);
 			CURVNET_SET(so->so_vnet);
 			SOCK_LOCK(so);
 			sorele(so);
 			CURVNET_RESTORE();
 			break;
 		}
 
 		npages += mpages;
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error != 0) {
 		/* Abort and free the pages from the failing record onward. */
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, total_pages - npages);
 	}
 
 	/* Separate from the per-record references taken in the loop. */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Bind the current thread to cpuset_domain[domain] and, if that
  * succeeds, set the thread's domain policy to DOMAINSET_PREF(domain).
  *
  * Returns 0 on success, or the error from cpuset_setthread(), in which
  * case the domain policy is left untouched.
  */
 static int
 ktls_bind_domain(int domain)
 {
 	int error;
 
 	error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]);
 	if (error == 0)
 		curthread->td_domain.dr_policy = DOMAINSET_PREF(domain);
 	return (error);
 }
 
 /*
  * Body of a per-domain KTLS buffer allocation thread.  'ctx' points at
  * this domain's struct ktls_domain_info.
  *
  * After trying to bind itself to the domain and registering its
  * statistics under a "domainN" sysctl node, the thread loops forever:
  * it sleeps on 'sc' until woken, then allocates ktls_max_alloc buffers
  * from ktls_buffer_zone with M_WAITOK and frees them all straight back
  * to the zone.  It never returns.
  */
 static void
 ktls_alloc_thread(void *ctx)
 {
 	struct ktls_domain_info *ktls_domain = ctx;
 	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
 	void **buf;
 	struct sysctl_oid *oid;
 	char name[80];
 	int domain, error, i, nbufs;
 
 	/* The domain number is this entry's index in ktls_domains[]. */
 	domain = ktls_domain - ktls_domains;
 	if (bootverbose)
 		printf("Starting KTLS alloc thread for domain %d\n", domain);
 	/* A binding failure is only reported; the thread carries on. */
 	error = ktls_bind_domain(domain);
 	if (error)
 		printf("Unable to bind KTLS alloc thread for domain %d: error %d\n",
 		    domain, error);
 	/* Export the per-domain counters through sysctl. */
 	snprintf(name, sizeof(name), "domain%d", domain);
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
 	    CTLFLAG_RD,  &sc->allocs, 0, "buffers allocated");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
 	    CTLFLAG_RD,  &sc->wakeups, 0, "thread wakeups");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
 	    CTLFLAG_RD,  &sc->running, 0, "thread running");
 
 	buf = NULL;
 	nbufs = 0;
 	for (;;) {
 		/* 'running' is 0 while asleep and 1 while working. */
 		atomic_store_int(&sc->running, 0);
 		tsleep(sc, PZERO | PNOLOCK, "-",  0);
 		atomic_store_int(&sc->running, 1);
 		sc->wakeups++;
 		/*
 		 * (Re)size the scratch pointer array whenever
 		 * ktls_max_alloc differs from the size in use.  On the
 		 * first pass 'buf' is NULL.
 		 */
 		if (nbufs != ktls_max_alloc) {
 			free(buf, M_KTLS);
 			nbufs = atomic_load_int(&ktls_max_alloc);
 			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
 			    M_WAITOK | M_ZERO);
 		}
 		/*
 		 * Below we allocate nbufs with different allocation
 		 * flags than we use when allocating normally during
 		 * encryption in the ktls worker thread.  We specify
 		 * M_NORECLAIM in the worker thread. However, we omit
 		 * that flag here and add M_WAITOK so that the VM
 		 * system is permitted to perform expensive work to
 		 * defragment memory.  We do this here, as it does not
 		 * matter if this thread blocks.  If we block a ktls
 		 * worker thread, we risk developing backlogs of
 		 * buffers to be encrypted, leading to surges of
 		 * traffic and potential NIC output drops.
 		 */
 		for (i = 0; i < nbufs; i++) {
 			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
 			sc->allocs++;
 		}
 		/* Return every buffer to the zone. */
 		for (i = 0; i < nbufs; i++) {
 			uma_zfree(ktls_buffer_zone, buf[i]);
 			buf[i] = NULL;
 		}
 	}
 }
 
 /*
  * Body of a KTLS worker thread.  'ctx' points at this thread's
  * struct ktls_wq.
  *
  * After optionally binding itself (see ktls_bind_threads below), the
  * thread loops forever draining two queues from its work queue:
  * 'm_head', holding mbufs to encrypt or free, and 'so_head', holding
  * sockets passed to ktls_decrypt().  It never returns.
  */
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
 	int cpu;
 
 	/* This queue's index in ktls_wq[] is used as the CPU number. */
 	cpu = wq - ktls_wq;
 	if (bootverbose)
 		printf("Starting KTLS worker thread for CPU %d\n", cpu);
 
 	/*
 	 * Bind to a core.  If ktls_bind_threads is > 1, then
 	 * we bind to the NUMA domain instead.
 	 */
 	if (ktls_bind_threads) {
 		int error;
 
 		if (ktls_bind_threads > 1) {
 			struct pcpu *pc = pcpu_find(cpu);
 
 			error = ktls_bind_domain(pc->pc_domain);
 		} else {
 			cpuset_t mask;
 
 			CPU_SETOF(cpu, &mask);
 			error = cpuset_setthread(curthread->td_tid, &mask);
 		}
 		/* A binding failure is only reported; the thread carries on. */
 		if (error)
 			printf("Unable to bind KTLS worker thread for CPU %d: error %d\n",
 				cpu, error);
 	}
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	/*
 	 * NOTE(review): presumably marks this thread as an FPU user so
 	 * software crypto can use SIMD registers -- confirm.
 	 */
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		/* Sleep until either queue has work; 'running' is false while asleep. */
 		while (STAILQ_EMPTY(&wq->m_head) &&
 		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		/*
 		 * Move both queues onto local lists while holding the
 		 * mutex, then process them with the mutex dropped.
 		 */
 		STAILQ_INIT(&local_m_head);
 		STAILQ_CONCAT(&local_m_head, &wq->m_head);
 		STAILQ_INIT(&local_so_head);
 		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
 		STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 			if (m->m_epg_flags & EPG_FLAG_2FREE) {
 				/* Queued only to be freed: drop the session and the mbuf. */
 				ktls_free(m->m_epg_tls);
 				m_free_raw(m);
 			} else {
 				/* The session chooses sync or async encryption. */
 				if (m->m_epg_tls->sync_dispatch)
 					ktls_encrypt(wq, m);
 				else
 					ktls_encrypt_async(wq, m);
 				counter_u64_add(ktls_cnt_tx_queued, -1);
 			}
 		}
 
 		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 			ktls_decrypt(so);
 			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Task handler queued by ktls_disable_ifnet().  'context' is the TLS
  * session; 'pending' is unused.
  *
  * Tries to switch the connection's transmit side to software TLS
  * (TCP_TLS_MODE_SW), bumps the matching ok/fail counter, and then
  * releases the socket, inpcb and session references that
  * ktls_disable_ifnet() took before queueing the task.
  */
 static void
 ktls_disable_ifnet_help(void *context, int pending __unused)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	int err;
 
 	tls = context;
 	inp = tls->inp;
 	if (inp == NULL)
 		return;
 	INP_WLOCK(inp);
 	so = inp->inp_socket;
 	MPASS(so != NULL);
 	/* Nothing to switch if the connection is already going away. */
 	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 	    (inp->inp_flags2 & INP_FREED)) {
 		goto out;
 	}
 
 	/* ENXIO if the send buffer no longer has a TLS session. */
 	if (so->so_snd.sb_tls_info != NULL)
 		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
 	else
 		err = ENXIO;
 	if (err == 0) {
 		counter_u64_add(ktls_ifnet_disable_ok, 1);
 		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
 		    (inp->inp_flags2 & INP_FREED) == 0 &&
 		    (tp = intotcpcb(inp)) != NULL &&
 		    tp->t_fb->tfb_hwtls_change != NULL)
 			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
 	} else {
 		counter_u64_add(ktls_ifnet_disable_fail, 1);
 	}
 
 out:
 	SOCK_LOCK(so);
 	sorele(so);
 	/*
 	 * Unlock only when in_pcbrele_wlocked() returns false.
 	 * NOTE(review): presumably a true return means the inpcb, and
 	 * its lock, were released by the call -- confirm.
 	 */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 	ktls_free(tls);
 }
 
 /*
  * Called when re-transmits are becoming a substantial portion of the
  * sends on this connection.  When this happens, we transition the
  * connection to software TLS.  This is needed because most inline TLS
  * NICs keep crypto state only for in-order transmits.  This means
  * that to handle a TCP rexmit (which is out-of-order), the NIC must
  * re-DMA the entire TLS record up to and including the current
  * segment.  This means that when re-transmitting the last ~1448 byte
  * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
  * of magnitude more data than we are sending.  This can cause the
  * PCIe link to saturate well before the network, which can cause
  * output drops, and a general loss of capacity.
  *
  * 'arg' is the connection's struct tcpcb; the inpcb write lock must be
  * held by the caller (asserted below).  The switch itself is not done
  * here: it is deferred to ktls_disable_ifnet_help(), run from
  * taskqueue_thread.
  */
 void
 ktls_disable_ifnet(void *arg)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	struct socket *so;
 	struct ktls_session *tls;
 
 	tp = arg;
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 	so = inp->inp_socket;
 	SOCK_LOCK(so);
 	tls = so->so_snd.sb_tls_info;
 	/* A request is already pending for this session: nothing to do. */
 	if (tls->disable_ifnet_pending) {
 		SOCK_UNLOCK(so);
 		return;
 	}
 
 	/*
 	 * note that disable_ifnet_pending is never cleared; disabling
 	 * ifnet can only be done once per session, so we never want
 	 * to do it again
 	 */
 
 	/*
 	 * Take session, inpcb and socket references for the task;
 	 * ktls_disable_ifnet_help() releases all three.
 	 */
 	(void)ktls_hold(tls);
 	in_pcbref(inp);
 	soref(so);
 	tls->disable_ifnet_pending = true;
 	tls->inp = inp;
 	SOCK_UNLOCK(so);
 	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
 }
 #endif
diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c
index d29ca730d7d6..95c3f5f009e4 100644
--- a/sys/powerpc/aim/mmu_radix.c
+++ b/sys/powerpc/aim/mmu_radix.c
@@ -1,6462 +1,6462 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018 Matthew Macy
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bitstring.h>
 #include <sys/queue.h>
 #include <sys/cpuset.h>
 #include <sys/endian.h>
 #include <sys/kerneldump.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/syslog.h>
 #include <sys/msgbuf.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/smp.h>
 
 #include <sys/kdb.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_dumpset.h>
 #include <vm/uma.h>
 
 #include <machine/_inttypes.h>
 #include <machine/cpu.h>
 #include <machine/platform.h>
 #include <machine/frame.h>
 #include <machine/md_var.h>
 #include <machine/psl.h>
 #include <machine/bat.h>
 #include <machine/hid.h>
 #include <machine/pte.h>
 #include <machine/sr.h>
 #include <machine/trap.h>
 #include <machine/mmuvar.h>
 
 /* For pseries bit. */
 #include <powerpc/pseries/phyp-hvcall.h>
 
 #ifdef INVARIANTS
 #include <vm/uma_dbg.h>
 #endif
 
 /*
  * Bit helpers that number bits from the most significant end of a
  * 'long': bit 0 is the top bit and bit (sizeof(long)*NBBY - 1) is the
  * bottom bit.
  *
  * PPC_BITLSHIFT(bit)		left-shift count that reaches 'bit'
  * PPC_BIT(bit)		mask with only 'bit' set
  * PPC_BITLSHIFT_VAL(val, bit)	'val' shifted so its low bit lands on 'bit'
  */
 #define PPC_BITLSHIFT(bit)	(sizeof(long)*NBBY - 1 - (bit))
 #define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
 
 #include "opt_ddb.h"
 
 #ifdef DDB
 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
 #endif
 
 #define PG_W	RPTE_WIRED
 #define PG_V	RPTE_VALID
 #define PG_MANAGED	RPTE_MANAGED
 #define PG_PROMOTED	RPTE_PROMOTED
 #define PG_M	RPTE_C
 #define PG_A	RPTE_R
 #define PG_X	RPTE_EAA_X
 #define PG_RW	RPTE_EAA_W
 #define PG_PTE_CACHE RPTE_ATTR_MASK
 
 #define RPTE_SHIFT 9
 #define NLS_MASK ((1UL<<5)-1)
 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
 #define RPTE_MASK (RPTE_ENTRIES-1)
 
 #define NLB_SHIFT 0
 #define NLB_MASK (((1UL<<52)-1) << 8)
 
 extern int nkpt;
 extern caddr_t crashdumpmap;
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
 #define RIC_FLUSH_ALL 2
 
 #define POWER9_TLB_SETS_RADIX	128	/* # sets in POWER9 TLB Radix mode */
 
 #define PPC_INST_TLBIE			0x7c000264
 #define PPC_INST_TLBIEL			0x7c000224
 #define PPC_INST_SLBIA			0x7c0003e4
 
 #define ___PPC_RA(a)	(((a) & 0x1f) << 16)
 #define ___PPC_RB(b)	(((b) & 0x1f) << 11)
 #define ___PPC_RS(s)	(((s) & 0x1f) << 21)
 #define ___PPC_RT(t)	___PPC_RS(t)
 #define ___PPC_R(r)	(((r) & 0x1) << 16)
 #define ___PPC_PRS(prs)	(((prs) & 0x1) << 17)
 #define ___PPC_RIC(ric)	(((ric) & 0x3) << 18)
 
 #define PPC_SLBIA(IH)	__XSTRING(.long PPC_INST_SLBIA | \
 				       ((IH & 0x7) << 21))
 #define	PPC_TLBIE_5(rb,rs,ric,prs,r)				\
 	__XSTRING(.long PPC_INST_TLBIE |			\
 			  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
 			  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
 			  ___PPC_R(r))
 
 #define	PPC_TLBIEL(rb,rs,ric,prs,r) \
 	 __XSTRING(.long PPC_INST_TLBIEL | \
 			   ___PPC_RB(rb) | ___PPC_RS(rs) |	\
 			   ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
 			   ___PPC_R(r))
 
 #define PPC_INVALIDATE_ERAT		PPC_SLBIA(7)
 
 /*
  * Issue the instruction sequence eieio, tlbsync, ptesync.  The "memory"
  * clobber also stops the compiler from moving memory accesses across
  * the sequence.
  */
 static __inline void
 ttusync(void)
 {
 	__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
 }
 
 #define TLBIEL_INVAL_SEL_MASK	0xc00	/* invalidation selector */
 #define  TLBIEL_INVAL_PAGE	0x000	/* invalidate a single page */
 #define  TLBIEL_INVAL_SET_PID	0x400	/* invalidate a set for the current PID */
 #define  TLBIEL_INVAL_SET_LPID	0x800	/* invalidate a set for current LPID */
 #define  TLBIEL_INVAL_SET	0xc00	/* invalidate a set for all LPIDs */
 
 #define TLBIE_ACTUAL_PAGE_MASK		0xe0
 #define  TLBIE_ACTUAL_PAGE_4K		0x00
 #define  TLBIE_ACTUAL_PAGE_64K		0xa0
 #define  TLBIE_ACTUAL_PAGE_2M		0x20
 #define  TLBIE_ACTUAL_PAGE_1G		0x40
 
 #define TLBIE_PRS_PARTITION_SCOPE	0x0
 #define TLBIE_PRS_PROCESS_SCOPE	0x1
 
 #define TLBIE_RIC_INVALIDATE_TLB	0x0	/* Invalidate just TLB */
 #define TLBIE_RIC_INVALIDATE_PWC	0x1	/* Invalidate just PWC */
 #define TLBIE_RIC_INVALIDATE_ALL	0x2	/* Invalidate TLB, PWC,
 						 * cached {proc, part}tab entries
 						 */
 #define TLBIE_RIC_INVALIDATE_SEQ	0x3	/* HPT - only:
 						 * Invalidate a range of translations
 						 */
 
 /*
  * Emit a single tlbie instruction.
  *
  * The RS operand carries 'pid' in its upper 32 bits and 'lpid' in its
  * lower 32 bits.  The RB operand is 'va' OR-ed with the invalidation
  * selector 'is' and the actual-page-size field 'ap'.  'ric' and 'prs'
  * go through "i" (immediate) asm constraints, so every caller must pass
  * compile-time constants for them.  The last macro argument fixes the
  * instruction's R field to 1.
  *
  * 'va' must be page aligned (checked with MPASS).
  */
 static __always_inline void
 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
 			vm_offset_t va, uint16_t ap)
 {
 	uint64_t rb, rs;
 
 	MPASS((va & PAGE_MASK) == 0);
 
 	rs = ((uint64_t)pid << 32) | lpid;
 	rb = va | is | ap;
 	__asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
 		"r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
 }
 
 /*
  * Follow-up sequence issued after a single-page tlbie: ptesync, a
  * process-scoped single-page tlbie of 'va' for PID 0, another ptesync,
  * and then the same tlbie again for 'pid'.  'ap' is the actual-page-size
  * field (one of the TLBIE_ACTUAL_PAGE_* values).
  *
  * NOTE(review): this looks like a hardware erratum workaround; the
  * reason is not stated here -- confirm before changing the sequence.
  */
 static __inline void
 radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
 {
 
 	__asm __volatile("ptesync" ::: "memory");
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_PAGE, 0, 0, va, ap);
 	__asm __volatile("ptesync" ::: "memory");
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_PAGE, pid, 0, va, ap);
 }
 
 /*
  * Invalidate the TLB entry for the 4K page at 'va' belonging to process
  * ID 'pid' (process scope, LPID 0), then run radix_tlbie_fixup().
  */
 static __inline void
 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 		TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
 	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
 }
 
 /*
  * Invalidate the TLB entry for the 2M page at 'va' belonging to process
  * ID 'pid' (process scope, LPID 0), then run radix_tlbie_fixup().
  */
 static __inline void
 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 		TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
 	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
 }
 
 /*
  * Invalidate only the page walk cache (RIC = PWC) for process ID 'pid',
  * using the per-PID set selector.  No address is supplied.
  */
 static __inline void
 radix_tlbie_invlpwc_user(uint32_t pid)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
 		TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
 }
 
 /*
  * Invalidate everything RIC = ALL covers (TLB, PWC and cached table
  * entries) for process ID 'pid', using the per-PID set selector.
  */
 static __inline void
 radix_tlbie_flush_user(uint32_t pid)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
 		TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
 }
 
 /*
  * Invalidate the TLB entry for the 4K page at 'va' using PID 0 (the
  * kernel variants all pass PID 0), then run radix_tlbie_fixup().
  */
 static __inline void
 radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
 	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
 }
 
 /*
  * Invalidate the TLB entry for the 2M page at 'va' using PID 0, then
  * run radix_tlbie_fixup().
  */
 static __inline void
 radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
 	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
 }
 
 /*
  * Invalidate the TLB entry for the 1G page at 'va' using PID 0, then
  * run radix_tlbie_fixup().
  *
  * 1GB pages aren't currently supported, hence the __unused marker.
  */
 static __inline __unused void
 radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
 	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
 }
 
 /*
  * Invalidate only the page walk cache (RIC = PWC) with PID 0 and
  * LPID 0, using the per-LPID set selector.  No address is supplied.
  */
 static __inline void
 radix_tlbie_invlpwc_kernel(void)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
 }
 
 /*
  * Invalidate everything RIC = ALL covers (TLB, PWC and cached table
  * entries) with PID 0 and LPID 0, using the per-LPID set selector.
  */
 static __inline void
 radix_tlbie_flush_kernel(void)
 {
 
 	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
 	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
 }
 
 /*
  * Return the number of the L3-entry-sized region that contains 'va':
  * the address masked with PG_FRAME and shifted down by
  * L3_PAGE_SIZE_SHIFT.  The result is not clipped to a table size.
  */
 static __inline vm_pindex_t
 pmap_l3e_pindex(vm_offset_t va)
 {
 	vm_offset_t frame;
 
 	frame = va & PG_FRAME;
 	return (frame >> L3_PAGE_SIZE_SHIFT);
 }
 
 /* Return the index of 'va' within a level 3 table (clipped by RPTE_MASK). */
 static __inline vm_pindex_t
 pmap_pml3e_index(vm_offset_t va)
 {
 	vm_offset_t slot;
 
 	slot = va >> L3_PAGE_SIZE_SHIFT;
 	return (slot & RPTE_MASK);
 }
 
 /* Return the index of 'va' within a level 2 table (clipped by RPTE_MASK). */
 static __inline vm_pindex_t
 pmap_pml2e_index(vm_offset_t va)
 {
 	vm_offset_t slot;
 
 	slot = va >> L2_PAGE_SIZE_SHIFT;
 	return (slot & RPTE_MASK);
 }
 
 /*
  * Return the index of 'va' within the level 1 table: the address masked
  * with PG_FRAME and shifted down by L1_PAGE_SIZE_SHIFT.  Unlike the
  * lower-level index helpers, no RPTE_MASK clipping is applied.
  */
 static __inline vm_pindex_t
 pmap_pml1e_index(vm_offset_t va)
 {
 	vm_offset_t frame;
 
 	frame = va & PG_FRAME;
 	return (frame >> L1_PAGE_SIZE_SHIFT);
 }
 
 /*
  * Return the index of 'va' within a page table page: the page number
  * of the address clipped by RPTE_MASK.
  */
 static __inline vm_pindex_t
 pmap_pte_index(vm_offset_t va)
 {
 	vm_offset_t pageno;
 
 	pageno = va >> PAGE_SHIFT;
 	return (pageno & RPTE_MASK);
 }
 
 /*
  * Given a level 3 entry, return a pointer to the PTE slot for 'va' in
  * the page table page that the entry references.  The table's physical
  * address is taken from the entry's NLB field and reached through the
  * direct map.
  */
 static __inline pt_entry_t *
 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
 {
 	vm_paddr_t table_pa;
 	pt_entry_t *table;
 
 	table_pa = be64toh(*l3e) & NLB_MASK;
 	table = (pt_entry_t *)PHYS_TO_DMAP(table_pa);
 	return (table + pmap_pte_index(va));
 }
 
 /*
  * Given a level 2 entry, return a pointer to the level 3 entry slot
  * for 'va' in the table that the entry references.  The table's
  * physical address is taken from the entry's NLB field and reached
  * through the direct map.
  */
 static __inline pt_entry_t *
 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
 {
 	vm_paddr_t table_pa;
 	pt_entry_t *table;
 
 	table_pa = be64toh(*l2e) & NLB_MASK;
 	table = (pml3_entry_t *)PHYS_TO_DMAP(table_pa);
 	return (table + pmap_pml3e_index(va));
 }
 
 /*
  * Given a level 1 entry, return a pointer to the level 2 entry slot
  * for 'va' in the table that the entry references.  The table's
  * physical address is taken from the entry's NLB field and reached
  * through the direct map.
  */
 static __inline pt_entry_t *
 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
 {
 	vm_paddr_t table_pa;
 	pt_entry_t *table;
 
 	table_pa = be64toh(*l1e) & NLB_MASK;
 	table = (pml2_entry_t *)PHYS_TO_DMAP(table_pa);
 	return (table + pmap_pml2e_index(va));
 }
 
 /*
  * Return a pointer to the level 1 entry slot for 'va' in 'pmap'.  The
  * entry itself is not examined.
  */
 static __inline pml1_entry_t *
 pmap_pml1e(pmap_t pmap, vm_offset_t va)
 {
 	vm_pindex_t idx;
 
 	idx = pmap_pml1e_index(va);
 	return (pmap->pm_pml1 + idx);
 }
 
 /*
  * Return a pointer to the level 2 entry slot for 'va' in 'pmap', or
  * NULL when the level 1 entry covering 'va' does not have RPTE_VALID
  * set.
  */
 static pt_entry_t *
 pmap_pml2e(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *upper;
 
 	upper = pmap_pml1e(pmap, va);
 	if (upper == NULL)
 		return (NULL);
 	if ((be64toh(*upper) & RPTE_VALID) == 0)
 		return (NULL);
 	return (pmap_l1e_to_l2e(upper, va));
 }
 
 /*
  * Return a pointer to the level 3 entry slot for 'va' in 'pmap', or
  * NULL when the walk stops early: no level 2 slot is available or the
  * level 2 entry does not have RPTE_VALID set.
  */
 static __inline pt_entry_t *
 pmap_pml3e(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *upper;
 
 	upper = pmap_pml2e(pmap, va);
 	if (upper == NULL)
 		return (NULL);
 	if ((be64toh(*upper) & RPTE_VALID) == 0)
 		return (NULL);
 	return (pmap_l2e_to_l3e(upper, va));
 }
 
 /*
  * Return a pointer to the PTE slot for 'va' in 'pmap', or NULL when the
  * walk stops early: no level 3 slot is available or the level 3 entry
  * does not have RPTE_VALID set.  The PTE itself is not examined.
  */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *upper;
 
 	upper = pmap_pml3e(pmap, va);
 	if (upper == NULL)
 		return (NULL);
 	if ((be64toh(*upper) & RPTE_VALID) == 0)
 		return (NULL);
 	return (pmap_l3e_to_pte(upper, va));
 }
 
 int nkpt = 64;
 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
     "Number of kernel page table pages allocated on bootup");
 
 vm_paddr_t dmaplimit;
 
 SYSCTL_DECL(_vm_pmap);
 
 #ifdef INVARIANTS
 #define VERBOSE_PMAP 0
 #define VERBOSE_PROTECT 0
 static int pmap_logging;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
     &pmap_logging, 0, "verbose debug logging");
 #endif
 
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 
 //static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
 
 static vm_offset_t qframe = 0;
 static struct mtx qframe_mtx;
 
 void mmu_radix_activate(struct thread *);
 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
     vm_size_t);
 void mmu_radix_clear_modify(vm_page_t);
 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
 	vm_prot_t);
 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
 void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
 vm_paddr_t mmu_radix_kextract(vm_offset_t);
 void mmu_radix_kremove(vm_offset_t);
 boolean_t mmu_radix_is_modified(vm_page_t);
 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
 boolean_t mmu_radix_is_referenced(vm_page_t);
 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
 	vm_pindex_t, vm_size_t);
 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
 void mmu_radix_page_init(vm_page_t);
 boolean_t mmu_radix_page_is_mapped(vm_page_t m);
 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
 int mmu_radix_page_wired_mappings(vm_page_t);
 int mmu_radix_pinit(pmap_t);
 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
 bool mmu_radix_ps_enabled(pmap_t);
 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
 void mmu_radix_qremove(vm_offset_t, int);
 vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
 void mmu_radix_quick_remove_page(vm_offset_t);
 boolean_t mmu_radix_ts_referenced(vm_page_t);
 void mmu_radix_release(pmap_t);
 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
 void mmu_radix_remove_all(vm_page_t);
 void mmu_radix_remove_pages(pmap_t);
 void mmu_radix_remove_write(vm_page_t);
 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
 void mmu_radix_zero_page(vm_page_t);
 void mmu_radix_zero_page_area(vm_page_t, int, int);
 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
 void mmu_radix_page_array_startup(long pages);
 
 #include "mmu_oea64.h"
 
 /*
  * Kernel MMU interface
  */
 
 static void	mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
 
 static void mmu_radix_copy_page(vm_page_t, vm_page_t);
 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
 static void mmu_radix_growkernel(vm_offset_t);
 static void mmu_radix_init(void);
 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
 static void mmu_radix_pinit0(pmap_t);
 
 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
 static void mmu_radix_unmapdev(vm_offset_t, vm_size_t);
 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
 static void mmu_radix_scan_init(void);
 static void	mmu_radix_cpu_bootstrap(int ap);
 static void	mmu_radix_tlbie_all(void);
 
 static struct pmap_funcs mmu_radix_methods = {
 	.bootstrap = mmu_radix_bootstrap,
 	.copy_page = mmu_radix_copy_page,
 	.copy_pages = mmu_radix_copy_pages,
 	.cpu_bootstrap = mmu_radix_cpu_bootstrap,
 	.growkernel = mmu_radix_growkernel,
 	.init = mmu_radix_init,
 	.map =      		mmu_radix_map,
 	.mincore =      	mmu_radix_mincore,
 	.pinit = mmu_radix_pinit,
 	.pinit0 = mmu_radix_pinit0,
 
 	.mapdev = mmu_radix_mapdev,
 	.mapdev_attr = mmu_radix_mapdev_attr,
 	.unmapdev = mmu_radix_unmapdev,
 	.kenter_attr = mmu_radix_kenter_attr,
 	.dev_direct_mapped = mmu_radix_dev_direct_mapped,
 	.dumpsys_pa_init = mmu_radix_scan_init,
 	.dumpsys_map_chunk = mmu_radix_dumpsys_map,
 	.page_is_mapped = mmu_radix_page_is_mapped,
 	.ps_enabled = mmu_radix_ps_enabled,
 	.align_superpage = mmu_radix_align_superpage,
 	.object_init_pt = mmu_radix_object_init_pt,
 	.protect = mmu_radix_protect,
 	/* pmap dispatcher interface */
 	.clear_modify = mmu_radix_clear_modify,
 	.copy = mmu_radix_copy,
 	.enter = mmu_radix_enter,
 	.enter_object = mmu_radix_enter_object,
 	.enter_quick = mmu_radix_enter_quick,
 	.extract = mmu_radix_extract,
 	.extract_and_hold = mmu_radix_extract_and_hold,
 	.is_modified = mmu_radix_is_modified,
 	.is_prefaultable = mmu_radix_is_prefaultable,
 	.is_referenced = mmu_radix_is_referenced,
 	.ts_referenced = mmu_radix_ts_referenced,
 	.page_exists_quick = mmu_radix_page_exists_quick,
 	.page_init = mmu_radix_page_init,
 	.page_wired_mappings =  mmu_radix_page_wired_mappings,
 	.qenter = mmu_radix_qenter,
 	.qremove = mmu_radix_qremove,
 	.release = mmu_radix_release,
 	.remove = mmu_radix_remove,
 	.remove_all = mmu_radix_remove_all,
 	.remove_write = mmu_radix_remove_write,
 	.unwire = mmu_radix_unwire,
 	.zero_page = mmu_radix_zero_page,
 	.zero_page_area = mmu_radix_zero_page_area,
 	.activate = mmu_radix_activate,
 	.quick_enter_page =  mmu_radix_quick_enter_page,
 	.quick_remove_page =  mmu_radix_quick_remove_page,
 	.page_set_memattr = mmu_radix_page_set_memattr,
 	.page_array_startup =  mmu_radix_page_array_startup,
 
 	/* Internal interfaces */
 	.kenter = mmu_radix_kenter,
 	.kextract = mmu_radix_kextract,
 	.kremove = mmu_radix_kremove,
 	.change_attr = mmu_radix_change_attr,
 	.decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
 
 	.tlbie_all = mmu_radix_tlbie_all,
 };
 
 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
 
 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
 	struct rwlock **lockp);
 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
     struct spglist *free);
 static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 	pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
 
 static bool	pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
 		    u_int flags, struct rwlock **lockp);
 #if VM_NRESERVLEVEL > 0
 static void	pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 	struct rwlock **lockp);
 #endif
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 	vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
 
 static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 	vm_prot_t prot, struct rwlock **lockp);
 static int	pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
 	u_int flags, vm_page_t m, struct rwlock **lockp);
 
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void free_pv_chunk(struct pv_chunk *pc);
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
 	struct rwlock **lockp);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
 	struct rwlock **lockp);
 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
 
 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
 static void pmap_invalidate_all(pmap_t pmap);
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 #define UNIMPLEMENTED() panic("%s not implemented", __func__)
 #define UNTESTED() panic("%s not yet tested", __func__)
 
 /* Number of supported PID bits */
 static unsigned int isa3_pid_bits;
 
 /* PID to start allocating from */
 static unsigned int isa3_base_pid;
 
 #define PROCTAB_SIZE_SHIFT	(isa3_pid_bits + 4)
 #define PROCTAB_ENTRIES	(1ul << isa3_pid_bits)
 
 /*
  * Map of physical memory regions.
  */
 static struct	mem_region *regions, *pregions;
 static struct	numa_mem_region *numa_pregions;
 static u_int	phys_avail_count;
 static int	regions_sz, pregions_sz, numa_pregions_sz;
 /* Partition table and process table, plus the arena named for ASIDs. */
 static struct pate *isa3_parttab;
 static struct prte *isa3_proctab;
 static vmem_t *asid_arena;
 
 extern void bs_remap_earlyboot(void);
 
 /* The top-level radix directory (PGD) occupies 1 << 16 bytes. */
 #define	RADIX_PGD_SIZE_SHIFT	16
 #define RADIX_PGD_SIZE	(1UL << RADIX_PGD_SIZE_SHIFT)
 
 /* Size shift minus 3; presumably log2 of the count of 8-byte entries. */
 #define	RADIX_PGD_INDEX_SHIFT	(RADIX_PGD_SIZE_SHIFT-3)
 /* Entries per page at the L2 and L3 levels. */
 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
 
 #define	NUPML1E		(RADIX_PGD_SIZE/sizeof(uint64_t))	/* number of userland PML1 pages */
 #define	NUPDPE		(NUPML1E * NL2EPG)/* number of userland PDP pages */
 #define	NUPDE		(NUPDPE * NL3EPG)	/* number of userland PD entries */
 
 /* POWER9 only permits a 64k partition table size. */
 #define	PARTTAB_SIZE_SHIFT	16
 #define PARTTAB_SIZE	(1UL << PARTTAB_SIZE_SHIFT)
 
 /*
  * NOTE(review): both flags are the same bit value (bit 63); presumably
  * they are applied to different doublewords of a partition table entry --
  * confirm against the users of these macros.
  */
 #define PARTTAB_HR		(1UL << 63) /* host uses radix */
 #define PARTTAB_GR		(1UL << 63) /* guest uses radix must match host */
 
 /* TLB flush actions. Used as argument to tlbiel_flush() */
 enum {
 	TLB_INVAL_SCOPE_LPID = 2,	/* invalidate TLBs for current LPID */
 	TLB_INVAL_SCOPE_GLOBAL = 3,	/* invalidate all TLBs */
 };
 
 /* Number of PV list locks; physical addresses are hashed onto them. */
 #define	NPV_LIST_LOCKS	MAXCPU
 static int pmap_initialized;
 static vm_paddr_t proctab0pa;
 static vm_paddr_t parttab_phys;
 /* pv_to_chunk() relies on a PV chunk filling exactly one page. */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 
 /*
  * Data for the pv entry allocation mechanism.
  * Updates to pv_invl_gen are protected by the pv_list_locks[]
  * elements, but reads are not.
  *
  * pv_chunks is the global list of PV chunks (linked through pc_lru) and is
  * protected by pv_chunks_mutex.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx __exclusive_cache_line pv_chunks_mutex;
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 /* PV_STAT(x) evaluates x only when PV_STATS is defined. */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /* Index of the L3-page-sized frame containing pa, and its pv_table slot. */
 #define	pa_radix_index(pa)	((pa) >> L3_PAGE_SIZE_SHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_radix_index(pa)])
 
 /* PV list lock covering pa: the frame index modulo the number of locks. */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp the write-locked PV list lock that covers pa.  If a different
  * lock is currently held through *lockp it is write-unlocked first; if the
  * right lock is already held nothing happens.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as above, keyed by the physical address of vm_page m. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /* Write-unlock the lock held through *lockp, if any, and clear *lockp. */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* PV list lock covering vm_page m. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /*
  * We support 52 bits, hence:
  * 52 - 31 = 21 = 0b10101
  * RTS encoding details, as implemented by the expression below:
  * the low three bits of that value (0b101 = 0x5) are shifted left by 5,
  * the upper two bits (0b10 = 0x2) are shifted left by 61.
  */
 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
 
 /*
  * Nonzero selects the global TLB invalidation scope in
  * mmu_radix_tlbie_all(); zero selects the per-LPID scope.
  */
 static int powernv_enabled = 1;
 
 /*
  * Emit one PPC_TLBIEL instruction.
  *
  * The RB operand is assembled from "set" and "is", the RS operand from
  * "pid".  "ric" and "prs" are handed to the assembler as immediates ("i"
  * constraints), so they must be compile-time constants at every (always
  * inlined) call site.
  */
 static __always_inline void
 tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
 	uint32_t pid, uint32_t ric, uint32_t prs)
 {
 	const uint64_t rb_operand = PPC_BITLSHIFT_VAL(set, 51) |
 	    PPC_BITLSHIFT_VAL(is, 53);
 	const uint64_t rs_operand = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
 
 	__asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
 		     : : "r"(rb_operand), "r"(rs_operand), "i"(ric), "i"(prs)
 		     : "memory");
 }
 
 /*
  * Invalidate TLB sets 0 .. num_sets - 1 on the local CPU with invalidation
  * scope "is".  The whole sequence is bracketed by ptesync barriers.
  *
  * The pass with prs == 0 runs only for TLB_INVAL_SCOPE_GLOBAL; the pass
  * with prs == 1 (process scoped entries) always runs.
  */
 static void
 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
 {
 	uint32_t set;
 
 	__asm __volatile("ptesync": : :"memory");
 
 	/*
 	 * Flush the first set of the TLB, and the entire Page Walk Cache
 	 * and partition table entries. Then flush the remaining sets of the
 	 * TLB.
 	 */
 	if (is == TLB_INVAL_SCOPE_GLOBAL) {
 		tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
 		for (set = 1; set < num_sets; set++)
 			tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
 	}
 
 	/* Do the same for process scoped entries. */
 	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
 	for (set = 1; set < num_sets; set++)
 		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
 
 	__asm __volatile("ptesync": : :"memory");
 }
 
 /*
  * Flush the local TLB for the given scope, which must be
  * TLB_INVAL_SCOPE_LPID or TLB_INVAL_SCOPE_GLOBAL, using the POWER9 radix
  * set count, then issue PPC_INVALIDATE_ERAT followed by isync.
  */
 static void
 mmu_radix_tlbiel_flush(int scope)
 {
 	MPASS(scope == TLB_INVAL_SCOPE_LPID ||
 		  scope == TLB_INVAL_SCOPE_GLOBAL);
 
 	tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope);
 	__asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
 /*
  * Flush all local TLB entries: global scope when powernv_enabled is set,
  * otherwise only the entries of the current LPID.
  *
  * Declared with an explicit (void) parameter list so that the definition
  * is a real prototype and calls with arguments are diagnosed.
  */
 static void
 mmu_radix_tlbie_all(void)
 {
 	if (powernv_enabled)
 		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
 	else
 		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
 }
 
 static void
 mmu_radix_init_amor(void)
 {
 	/*
 	* In HV mode, we init AMOR (Authority Mask Override Register) so that
 	* the hypervisor and guest can setup IAMR (Instruction Authority Mask
 	* Register), enable key 0 and set it to 1.
 	*
 	* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
 	*/
 	mtspr(SPR_AMOR, (3ul << 62));
 }
 
 static void
 mmu_radix_init_iamr(void)
 {
 	/*
 	 * Radix always uses key0 of the IAMR to determine if an access is
 	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
 	 * fetch.
 	 */
 	mtspr(SPR_IAMR, (1ul << 62));
 }
 
 /*
  * Load the PID register (SPR_PID) with the pmap's pm_pid and issue an
  * isync afterwards.
  */
 static void
 mmu_radix_pid_set(pmap_t pmap)
 {
 
 	mtspr(SPR_PID, pmap->pm_pid);
 	isync();
 }
 
 /*
  * qsort-style comparator for physical addresses: returns -1, 0 or 1 when
  * the address behind "a" is below, equal to or above the one behind "b".
  */
 static int
 pa_cmp(const void *a, const void *b)
 {
 	const vm_paddr_t lhs = *(const vm_paddr_t *)a;
 	const vm_paddr_t rhs = *(const vm_paddr_t *)b;
 
 	if (lhs == rhs)
 		return (0);
 	return (lhs < rhs ? -1 : 1);
 }
 
 /*
  * Page table entry accessors.
  *
  * pte_load_store() atomically swaps in a new value and pte_load_clear()
  * atomically swaps in zero; both yield the previous contents.
  */
 #define	pte_load_store(ptep, pte)	atomic_swap_long(ptep, pte)
 #define	pte_load_clear(ptep)		atomic_swap_long(ptep, 0)
 /*
  * Store a leaf entry: PG_V and RPTE_LEAF are OR-ed in and the value is
  * converted with htobe64().  The MPASS requires at least one of the R/W/X
  * access bits.  Note that "pte" is evaluated twice.
  */
 #define	pte_store(ptep, pte) do {	   \
 	MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));	\
 	*(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
 } while (0)
 /*
  * NB: should only be used for adding directories - not for direct mappings
  *
  * The "pa" argument is parenthesized so that an expression argument
  * (for example a conditional expression) combines correctly with the
  * OR-ed flag bits.
  */
 #define	pde_store(ptep, pa) do {				\
 	*(u_long *)(ptep) = htobe64((u_long)((pa) | RPTE_VALID | RPTE_SHIFT)); \
 } while (0)
 
 /* Plain (non-atomic) store of zero into the entry. */
 #define	pte_clear(ptep) do {					\
 		*(u_long *)(ptep) = (u_long)(0);		\
 } while (0)
 
 /* pm_flags bit tested by mmu_radix_ps_enabled(). */
 #define	PMAP_PDE_SUPERPAGE	(1 << 8)	/* supports 2MB superpages */
 
 /*
  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define	PG_PTE_PROMOTE	(PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
 	    PG_M | PG_A | RPTE_EAA_MASK | PG_V)
 
 /*
  * Add "count" to the pmap's resident page count.  The pmap lock must be
  * held by the caller.
  */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pmap->pm_stats.resident_count += count;
 }
 
 /*
  * Subtract "count" from the pmap's resident page count.  The pmap lock
  * must be held by the caller; the KASSERT checks that the count does not
  * drop below zero.
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count -= count;
 }
 
 /*
  * Zero the whole page that contains the virtual address "va".
  */
 static void
 pagezero(vm_offset_t va)
 {
 	bzero((void *)trunc_page(va), PAGE_SIZE);
 }
 
 /*
  * Allocate "n" page-aligned pages with moea64_bootstrap_alloc(), zero each
  * of them through its direct-map address and return the physical address
  * of the first page.
  */
 static uint64_t
 allocpages(int n)
 {
 	u_int64_t base;
 	int i;
 
 	base = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
 	i = 0;
 	while (i < n) {
 		pagezero(PHYS_TO_DMAP(base + i * PAGE_SIZE));
 		i++;
 	}
 	return (base);
 }
 
 /*
  * Look up the PTE for kernel virtual address "va" in kernel_pmap.
  * Returns NULL when there is no L3 entry or the L3 entry is not valid.
  */
 static pt_entry_t *
 kvtopte(vm_offset_t va)
 {
 	pt_entry_t *dir;
 
 	dir = pmap_pml3e(kernel_pmap, va);
 	if (dir != NULL && (be64toh(*dir) & RPTE_VALID) != 0)
 		return (pmap_l3e_to_pte(dir, va));
 	return (NULL);
 }
 
 /*
  * Enter a kernel mapping of physical address "pa" at virtual address "va".
  * The PTE is written as a valid, privileged, readable and writable leaf
  * with the modified and accessed bits already set.  The page table page
  * for "va" must already exist (asserted).
  */
 void
 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 	pt_entry_t *slot;
 
 	slot = kvtopte(va);
 	MPASS(slot != NULL);
 	*slot = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R |
 	    RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
 }
 
 /*
  * Report whether superpages may be used for this pmap: the global
  * superpages_enabled switch must be on and the pmap must carry the
  * PMAP_PDE_SUPERPAGE flag.
  */
 bool
 mmu_radix_ps_enabled(pmap_t pmap)
 {
 	if (!superpages_enabled)
 		return (false);
 	return ((pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 }
 
 /*
  * Find the valid leaf entry that maps "va" in "pmap".
  *
  * Returns a pointer to the L3 entry with *is_l3e set to 1 when that entry
  * is itself a leaf, a pointer to the 4KB PTE with *is_l3e set to 0
  * otherwise, or NULL when no valid entry exists.  *is_l3e is left
  * untouched when the L3 entry is missing or invalid.
  *
  * The caller's address is masked separately for each lookup.  Previously
  * "va" itself was overwritten with the PG_PS_FRAME-masked value before the
  * PTE lookup, so (assuming PG_PS_FRAME clears the offset within a
  * superpage -- confirm against its definition) the PTE returned did not
  * depend on the address bits that select a 4KB page inside the superpage
  * range.
  */
 static pt_entry_t *
 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
 {
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 
 	l3e = pmap_pml3e(pmap, va & PG_PS_FRAME);
 	if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
 		return (NULL);
 
 	if (be64toh(*l3e) & RPTE_LEAF) {
 		*is_l3e = 1;
 		return (l3e);
 	}
 	*is_l3e = 0;
 	/* Keep the 4KB page index bits of the original address. */
 	pte = pmap_l3e_to_pte(l3e, va & PG_FRAME);
 	if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
 		return (NULL);
 	return (pte);
 }
 
 /*
  * Try to resolve a fault on "va" by setting the accessed (PG_A) or
  * modified (PG_M) bit in the existing leaf entry instead of taking the
  * full fault path.
  *
  * Returns:
  *   0                        the entry was updated;
  *   KERN_INVALID_ADDRESS     no valid entry maps va;
  *   KERN_PROTECTION_FAILURE  the entry does not grant the access, or a
  *                            write hit a leaf L3 (superpage) entry;
  *   KERN_FAILURE             the bit wanted was already set on first
  *                            inspection (the pmap's TLB entries are
  *                            flushed in that case), the entry changed
  *                            before the pmap lock was taken, or the
  *                            update left the entry unchanged.
  *
  * "flags" is matched exactly against VM_PROT_READ, VM_PROT_WRITE and
  * VM_PROT_EXECUTE; any other combination modifies nothing.
  */
 int
 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
 {
 	pt_entry_t *pte;
 	pt_entry_t startpte, origpte, newpte;
 	vm_page_t m;
 	int is_l3e;
 
 	startpte = 0;
  retry:
 	/* Unlocked peek at the entry; it is re-validated under the lock. */
 	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
 		return (KERN_INVALID_ADDRESS);
 	origpte = newpte = be64toh(*pte);
 	if (startpte == 0) {
 		/* First pass only: remember the value the fault started with. */
 		startpte = origpte;
 		if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
 		    ((flags & VM_PROT_READ) && (startpte & PG_A))) {
 			pmap_invalidate_all(pmap);
 #ifdef INVARIANTS
 			if (VERBOSE_PMAP || pmap_logging)
 				printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
 				    __func__, pmap, va, flags, origpte);
 #endif
 			return (KERN_FAILURE);
 		}
 	}
 #ifdef INVARIANTS
 	if (VERBOSE_PMAP || pmap_logging)
 		printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
 		    flags, origpte);
 #endif
 	PMAP_LOCK(pmap);
 	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
 	    be64toh(*pte) != origpte) {
 		PMAP_UNLOCK(pmap);
 		return (KERN_FAILURE);
 	}
 	m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
 	MPASS(m != NULL);
 	switch (flags) {
 	case VM_PROT_READ:
 		if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
 			goto protfail;
 		newpte |= PG_A;
 		vm_page_aflag_set(m, PGA_REFERENCED);
 		break;
 	case VM_PROT_WRITE:
 		if ((newpte & RPTE_EAA_W) == 0)
 			goto protfail;
 		if (is_l3e)
 			goto protfail;
 		newpte |= PG_M;
 		vm_page_dirty(m);
 		break;
 	case VM_PROT_EXECUTE:
 		if ((newpte & RPTE_EAA_X) == 0)
 			goto protfail;
 		newpte |= PG_A;
 		vm_page_aflag_set(m, PGA_REFERENCED);
 		break;
 	}
 
 	if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) {
 		/*
 		 * The retry path takes the pmap lock again and may also
 		 * return early, so the lock must be dropped before jumping
 		 * back; otherwise the lock would be re-acquired while held
 		 * or leaked on return.
 		 */
 		PMAP_UNLOCK(pmap);
 		goto retry;
 	}
 	ptesync();
 	PMAP_UNLOCK(pmap);
 	if (startpte == newpte)
 		return (KERN_FAILURE);
 	return (0);
  protfail:
 	PMAP_UNLOCK(pmap);
 	return (KERN_PROTECTION_FAILURE);
 }
 
 /*
  * Tell whether page "m" has any mapping, either on its own pv list or,
  * for non-fictitious pages, on the pv list of the 2mpage containing it.
  * Unmanaged pages always yield FALSE.  The pv list lock for the page is
  * read-held while the lists are examined.
  */
 boolean_t
 mmu_radix_page_is_mapped(vm_page_t m)
 {
 	struct rwlock *pvl;
 	boolean_t mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 	pvl = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(pvl);
 	if (!TAILQ_EMPTY(&m->md.pv_list))
 		mapped = TRUE;
 	else if ((m->flags & PG_FICTITIOUS) != 0)
 		mapped = FALSE;
 	else
 		mapped = !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
 	rw_runlock(pvl);
 	return (mapped);
 }
 
 /*
  * Translate a VM memory attribute into the attribute bits to OR into a
  * PTE or PDE.  VM_MEMATTR_DEFAULT and any attribute not listed below map
  * to 0.
  */
 static int
 pmap_cache_bits(vm_memattr_t ma)
 {
 	if (ma == VM_MEMATTR_DEFAULT)
 		return (0);
 
 	switch (ma) {
 	case VM_MEMATTR_UNCACHEABLE:
 		return (RPTE_ATTR_GUARDEDIO);
 	case VM_MEMATTR_CACHEABLE:
 		return (RPTE_ATTR_MEM);
 	case VM_MEMATTR_WRITE_BACK:
 	case VM_MEMATTR_PREFETCHABLE:
 	case VM_MEMATTR_WRITE_COMBINING:
 		return (RPTE_ATTR_UNGUARDEDIO);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Invalidate the 4KB translation for "start": ptesync, then the kernel or
  * the per-PID user tlbie variant depending on the pmap, then ttusync().
  */
 static void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
 {
 	ptesync();
 	if (pmap == kernel_pmap)
 		radix_tlbie_invlpg_kernel_4k(start);
 	else
 		radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
 	ttusync();
 }
 
 /*
  * Invalidate the 2MB translation for "start"; same barrier sequence as
  * pmap_invalidate_page() but using the 2m tlbie variants.
  */
 static void
 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
 {
 	ptesync();
 	if (pmap == kernel_pmap)
 		radix_tlbie_invlpg_kernel_2m(start);
 	else
 		radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
 	ttusync();
 }
 
 /*
  * Invalidate the page walk cache for the kernel or for the pmap's PID,
  * bracketed by ptesync() and ttusync().
  */
 static void
 pmap_invalidate_pwc(pmap_t pmap)
 {
 	ptesync();
 	if (pmap == kernel_pmap)
 		radix_tlbie_invlpwc_kernel();
 	else
 		radix_tlbie_invlpwc_user(pmap->pm_pid);
 	ttusync();
 }
 
 /*
  * Invalidate the 4KB translations covering [start, end).
  *
  * Ranges of more than 8 pages are handled by flushing every translation
  * of the pmap instead of issuing one tlbie per page.  The length is
  * computed as end - start: with the operands the other way round the
  * unsigned subtraction wraps to a huge value for every non-empty range,
  * so the per-page path was never taken.  A caller that passes
  * end < start still falls back to the full flush.
  */
 static void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
 {
 	if (((end - start) >> PAGE_SHIFT) > 8) {
 		pmap_invalidate_all(pmap);
 		return;
 	}
 	ptesync();
 	if (pmap == kernel_pmap) {
 		while (start < end) {
 			radix_tlbie_invlpg_kernel_4k(start);
 			start += PAGE_SIZE;
 		}
 	} else {
 		while (start < end) {
 			radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
 			start += PAGE_SIZE;
 		}
 	}
 	ttusync();
 }
 
 /*
  * Flush all translations of the kernel or of the pmap's PID, bracketed by
  * ptesync() and ttusync().
  */
 static void
 pmap_invalidate_all(pmap_t pmap)
 {
 	ptesync();
 	if (pmap == kernel_pmap)
 		radix_tlbie_flush_kernel();
 	else
 		radix_tlbie_flush_user(pmap->pm_pid);
 	ttusync();
 }
 
 /*
  * Invalidate the TLB entries for the 2MB mapping at "va" whose L3 entry
  * value was "l3e", then invalidate the page walk cache for the pmap.
  */
 static void
 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
 {
 
 	/*
 	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
 	 * by a promotion that did not invalidate the 512 4KB page mappings
 	 * that might exist in the TLB.  Consequently, at this point, the TLB
 	 * may hold both 4KB and 2MB page mappings for the address range [va,
 	 * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated here.
 	 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
 	 * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
 	 * single INVLPG suffices to invalidate the 2MB page mapping from the
 	 * TLB.
 	 */
 	ptesync();
 	if ((l3e & PG_PROMOTED) != 0)
 		/* End is the last byte of the range, not one past it. */
 		pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
 	else
 		pmap_invalidate_page_2m(pmap, va);
 
 	pmap_invalidate_pwc(pmap);
 }
 
 /*
  * Map a pv entry to the pv_chunk containing it by rounding the entry's
  * address down to the start of its page.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_base;
 
 	chunk_base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_base);
 }
 
 /* The pmap that owns the chunk holding pv. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Bitmap values of a completely free chunk: a set bit marks a free pv
  * entry.  The second word has its two top bits clear.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0x3ffffffffffffffful
 
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * The given PV list lock may be released.
  *
  * The pmap lock must be held and lockp must not be NULL (both asserted).
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		/* Count the set (free) bits in this chunk's bitmap. */
 		bit_count((bitstr_t *)pc->pc_map, 0,
 				  sizeof(pc->pc_map) * NBBY, &free);
 #if 0
 		free = popcnt_pc_map_pq(pc->pc_map);
 #endif
 		/* Stop at the first chunk with no free entry. */
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Each new chunk contributes _NPCPV entries. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			/* No page came back: recount what the pmap has. */
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global pv_chunks list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
  * 2MB page mappings.
  *
  * On a successful removal the list's generation count (pv_gen) is bumped.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 #ifdef INVARIANTS
 		/* Dump the chunk before the assertion below fires. */
 		if (PV_PMAP(pv) == NULL) {
 			printf("corrupted pv_chunk/pv %p\n", pv);
 			printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
 		}
 		MPASS(PV_PMAP(pv) != NULL);
 		MPASS(pv->pv_va != 0);
 #endif
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 			pvh->pv_gen++;
 			break;
 		}
 	}
 	/* pv is NULL here when the loop ran off the end of the list. */
 	return (pv);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * "pa" must be 2mpage aligned and the pmap lock must be held (both
  * asserted).  The spare pv entries consumed here are taken from the
  * pmap's existing chunks; no chunk is allocated in this function.
  */
 static void
 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 	int bit, field;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((pa & L3_PAGE_MASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 
 	m->md.pv_gen++;
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
 	for (;;) {
 		/* Always draw from the chunk at the head of the pmap's list. */
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
 		    , ("pmap_pv_demote_pde: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field]) {
 				/* Claim the lowest free entry of this word. */
 				bit = cnttzd(pc->pc_map[field]);
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_pde: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* Chunk exhausted: move it to the tail and take the next. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used is now full, move it to the tail as well. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 }
 
 /*
  * Finish working on "pmap" during PV chunk reclamation: flush all of its
  * TLB entries and drop its lock unless it is the pmap the caller already
  * had locked on entry.  A NULL pmap is ignored.
  */
 static void
 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
 {
 
 	if (pmap != NULL) {
 		pmap_invalidate_all(pmap);
 		if (pmap != locked_pmap)
 			PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Returns NULL if PV entries were reclaimed from the specified pmap.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  *
  * Otherwise the page returned is either the page of a PV chunk that
  * became entirely free, or a page table page released while unmapping
  * (its ref_count is reset to 1 before it is returned).
  */
 /* Number of reclaim_pv_chunk() calls in progress; see its use below. */
 static int active_reclaims = 0;
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
 	pml3_entry_t *l3e;
 	pmap_t next_pmap, pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint64_t inuse;
 	int bit, field, freed;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 	pmap = NULL;
 	m_pc = NULL;
 	SLIST_INIT(&free);
 	/*
 	 * The markers are zeroed chunk headers, so their pc_pmap is NULL;
 	 * that is how marker entries are told apart from real chunks below.
 	 */
 	bzero(&pc_marker_b, sizeof(pc_marker_b));
 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 	pc_marker = (struct pv_chunk *)&pc_marker_b;
 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 
 	mtx_lock(&pv_chunks_mutex);
 	active_reclaims++;
 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 	/* Scan until the end marker is reached or a page has been freed. */
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
 		if (next_pmap == NULL) {
 			/*
 			 * The next chunk is a marker.  However, it is
 			 * not our marker, so active_reclaims must be
 			 * > 1.  Consequently, the next_chunk code
 			 * will not rotate the pv_chunks list.
 			 */
 			goto next_chunk;
 		}
 		mtx_unlock(&pv_chunks_mutex);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
 		 * when both pv_chunks_mutex is owned and the
 		 * corresponding pmap is locked.
 		 */
 		if (pmap != next_pmap) {
 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				mtx_lock(&pv_chunks_mutex);
 				continue;
 			} else if (pmap != locked_pmap) {
 				if (PMAP_TRYLOCK(pmap)) {
 					mtx_lock(&pv_chunks_mutex);
 					continue;
 				} else {
 					pmap = NULL; /* pmap is not locked */
 					mtx_lock(&pv_chunks_mutex);
 					pc = TAILQ_NEXT(pc_marker, pc_lru);
 					if (pc == NULL ||
 					    pc->pc_pmap != next_pmap)
 						continue;
 					goto next_chunk;
 				}
 			}
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* Walk the allocated (clear) bits of this bitmap word. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = cnttzd(inuse);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va = pv->pv_va;
 				l3e = pmap_pml3e(pmap, va);
 				/* Skip entries that belong to a leaf L3 mapping. */
 				if ((be64toh(*l3e) & RPTE_LEAF) != 0)
 					continue;
 				pte = pmap_l3e_to_pte(l3e, va);
 				/* Skip wired mappings. */
 				if ((be64toh(*pte) & PG_W) != 0)
 					continue;
 				tpte = be64toh(pte_load_clear(pte));
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				/* Carry the dirty/referenced state to the page. */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 					vm_page_dirty(m);
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 
 				m->md.pv_gen++;
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				/* Mark the pv entry free again. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			mtx_lock(&pv_chunks_mutex);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap_resident_count_dec(pmap, freed);
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
 			mtx_lock(&pv_chunks_mutex);
 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 			break;
 		}
 		/* Chunk still in use: it now has free entries, so put it first. */
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		mtx_lock(&pv_chunks_mutex);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 next_chunk:
 		/* Advance our marker past the chunk just examined. */
 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 		if (active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
 			 * freed (because they contained a wired
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 			}
 		}
 	}
 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 	active_reclaims--;
 	mtx_unlock(&pv_chunks_mutex);
 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->ref_count = 1;
 	}
 	/* Release whatever else ended up on the free list. */
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Return a pv entry to its chunk.  When the chunk becomes entirely free it
  * is unlinked from the pmap and released; otherwise it is moved to the
  * head of the pmap's chunk list so that free entries are found quickly.
  * The pmap lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *chunk;
 	int slot;
 
 #ifdef VERBOSE_PV
 	if (pmap != kernel_pmap)
 		printf("%s(%p, %p)\n", __func__, pmap, pv);
 #endif
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Set the entry's bit in the chunk bitmap to mark it free. */
 	chunk = pv_to_chunk(pv);
 	slot = pv - &chunk->pc_pventry[0];
 	chunk->pc_map[slot / 64] |= 1ul << (slot % 64);
 
 	if (chunk->pc_map[0] == PC_FREE0 && chunk->pc_map[1] == PC_FREE1) {
 		/* Nothing left in use: give the whole chunk back. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		free_pv_chunk(chunk);
 		return;
 	}
 
 	/* 98% of the time, the chunk is already at the head of the list. */
 	if (__predict_false(chunk != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, chunk, pc_list);
 	}
 }
 
 /*
  * Release an entirely free PV chunk: unlink it from the global pv_chunks
  * list under pv_chunks_mutex, drop its page from the dump set, and unwire
  * and free the backing page.  The caller has already removed the chunk
  * from its pmap's list.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t m;
 
 	mtx_lock(&pv_chunks_mutex);
  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(m->phys_addr);
 	vm_page_unwire_noq(m);
 	vm_page_free(m);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * The given PV list lock may be released.
  *
  * The pmap lock must be held (asserted).
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	/* Only the chunk at the head of the pmap's list is searched. */
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = cnttzd(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			MPASS(PV_PMAP(pv) != NULL);
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 	if (m == NULL) {
 		/* Without a lock pointer reclamation is not attempted. */
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		/* No page returned: search this pmap's chunks again. */
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 is the one whose bit was cleared above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	MPASS(PV_PMAP(pv) != NULL);
 	return (pv);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
  * replace the many pv entries for the 4KB page mappings by a single pv entry
  * for the 2MB page mapping.
  *
  * "pa" must be 2mpage aligned (asserted).  On return *lockp is the PV list
  * lock that covers pa.
  */
 static void
 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
 	KASSERT((pa & L3_PAGE_MASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 	 * a transfer avoids the possibility that get_pv_entry() calls
 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 	 * mappings that is being promoted.
 	 */
 	m = PHYS_TO_VM_PAGE(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(&m->md, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 	pvh->pv_gen++;
 	/* Free the remaining NPTEPG - 1 pv entries. */
 	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
 	do {
 		/* Step page and address together through the 2MB range. */
 		m++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&m->md, pmap, va);
 	} while (va < va_last);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * First find and then destroy the pv entry for the specified pmap and virtual
  * address.  This operation can be performed on pv lists for either 4KB or 2MB
  * page mappings.
  *
  * The entry must exist (asserted); it is handed to free_pv_entry().
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, pv);
 }
 
 /*
  * Try to create the PV entry for a 4KB mapping of page "m" at "va" without
  * falling back to PV chunk reclamation.  Returns TRUE when the entry was
  * added to the page's pv list (with *lockp switched to the page's PV list
  * lock) and FALSE when no pv entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	entry = get_pv_entry(pmap, NULL);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_link);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /* (start, end) pairs that validate_addr() checks against. */
 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
 #ifdef INVARIANTS
 /*
  * Assert that [addr, addr + size) lies entirely inside one of the first
  * phys_avail_count ranges recorded in phys_avail_debug[].
  */
 static void
 validate_addr(vm_paddr_t addr, vm_size_t size)
 {
 	const vm_paddr_t end = addr + size;
 	bool found;
 	int i;
 
 	found = false;
 	for (i = 0; !found && i < 2 * phys_avail_count; i += 2)
 		found = (addr >= phys_avail_debug[i] &&
 		    end <= phys_avail_debug[i + 1]);
 	KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
 					addr, end));
 }
 #else
 /* Without INVARIANTS the check compiles to nothing. */
 static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
 #endif
 /* PTE bits used for direct-map leaf entries. */
 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
 
 /*
  * Allocate one zero-filled page for use as a page table page and return
  * its physical address.
  *
  * allocpages() already zeroes every page it hands out, so the page is not
  * zeroed a second time here.
  */
 static vm_paddr_t
 alloc_pt_page(void)
 {
 
 	return (allocpages(1));
 }
 
 /*
  * Create direct-map entries in kernel_pmap for the physical range
  * [start, end), mapping each physical address at PHYS_TO_DMAP(address).
  *
  * At every step the largest possible leaf is used: an L2-level leaf when
  * "start" is L2_PAGE_SIZE aligned and at least that much remains, else an
  * L3-level leaf under the same rule for L3_PAGE_SIZE, else a single
  * PAGE_SIZE PTE.  Missing intermediate page table pages are allocated with
  * alloc_pt_page() on the way down.
  */
 static void
 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
 {
 	pt_entry_t *pte, pteval;
 	vm_paddr_t page;
 
 	if (bootverbose)
 		printf("%s %lx -> %lx\n", __func__, start, end);
 	while (start < end) {
 		/* Captured before "start" is advanced below. */
 		pteval = start | DMAP_PAGE_BITS;
 		pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
 		if ((be64toh(*pte) & RPTE_VALID) == 0) {
 			page = alloc_pt_page();
 			pde_store(pte, page);
 		}
 		pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
 		if ((start & L2_PAGE_MASK) == 0 &&
 			end - start >= L2_PAGE_SIZE) {
 			/* Leaf at the L2 level. */
 			start += L2_PAGE_SIZE;
 			goto done;
 		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
 			page = alloc_pt_page();
 			pde_store(pte, page);
 		}
 
 		pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
 		if ((start & L3_PAGE_MASK) == 0 &&
 			end - start >= L3_PAGE_SIZE) {
 			/* Leaf at the L3 level. */
 			start += L3_PAGE_SIZE;
 			goto done;
 		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
 			page = alloc_pt_page();
 			pde_store(pte, page);
 		}
 		pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
 		start += PAGE_SIZE;
 	done:
 		/* "pte" points at whichever level was chosen above. */
 		pte_store(pte, pteval);
 	}
 }
 
 /*
  * Direct-map every region listed in pregions[].  A non-zero hwphyssz caps
  * the mapping: a region that crosses the limit is clipped to it, and the
  * walk stops at the first region that starts at or above it.
  */
 static void
 mmu_radix_dmap_populate(vm_size_t hwphyssz)
 {
 	vm_paddr_t start, end;
 	int idx;
 
 	for (idx = 0; idx < pregions_sz; idx++) {
 		start = pregions[idx].mr_start;
 		end = start + pregions[idx].mr_size;
 		if (hwphyssz != 0) {
 			if (start >= hwphyssz)
 				break;
 			if (end > hwphyssz)
 				end = hwphyssz;
 		}
 		mmu_radix_dmap_range(start, end);
 	}
 }
 
 /*
  * Build the initial kernel radix tree: reset and lock-initialize
  * kernel_pmap, allocate and zero the top-level directory, populate the
  * direct map, and wire nkpt preallocated page-table pages in at
  * VM_MIN_KERNEL_ADDRESS.
  *
  * hwphyssz is passed through to mmu_radix_dmap_populate(); zero means the
  * direct map is not capped.
  */
 static void
 mmu_radix_setup_pagetables(vm_size_t hwphyssz)
 {
 	vm_paddr_t ptpages, pages;
 	pt_entry_t *pte;
 	vm_paddr_t l1phys;
 
 	bzero(kernel_pmap, sizeof(struct pmap));
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	/*
 	 * NOTE(review): three pages are requested here but only two are
 	 * visibly consumed below (one for the L2 directory and one for the
 	 * L3 directory) -- confirm whether the third is intentional.
 	 */
 	ptpages = allocpages(3);
 	/* The top-level directory must be aligned to its own size. */
 	l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
 	validate_addr(l1phys, RADIX_PGD_SIZE);
 	if (bootverbose)
 		printf("l1phys=%lx\n", l1phys);
 	MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
 	for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
 		pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
 	kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
 
 	mmu_radix_dmap_populate(hwphyssz);
 
 	/*
 	 * Create page tables for first 128MB of KVA
 	 */
 	pages = ptpages;
 	/* First page of ptpages becomes the L2 directory. */
 	pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
 	*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 	pages += PAGE_SIZE;
 	/* Second page of ptpages becomes the L3 directory. */
 	pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
 	*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 	pages += PAGE_SIZE;
 	pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
 	/*
 	 * the kernel page table pages need to be preserved in
 	 * phys_avail and not overlap with previous  allocations
 	 */
 	pages = allocpages(nkpt);
 	if (bootverbose) {
 		printf("phys_avail after dmap populate and nkpt allocation\n");
 		for (int j = 0; j < 2 * phys_avail_count; j+=2)
 			printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
 				   j, phys_avail[j], j + 1, phys_avail[j + 1]);
 	}
 	KPTphys = pages;
 	/* Point nkpt consecutive L3 entries at the preallocated PT pages. */
 	for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
 		*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 	/* Each L3 entry covers L3_PAGE_SIZE of kernel virtual address space. */
 	kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
 	if (bootverbose)
 		printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
 	 * preallocated kernel page table pages so that vm_page structures
 	 * representing these pages will be created.  The vm_page structures
 	 * are required for promotion of the corresponding kernel virtual
 	 * addresses to superpage mappings.
 	 */
 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 }
 
 /*
  * First stage of pmap bootstrap.
  *
  * Fetches the memory regions, fills phys_avail[] and dump_avail[] from
  * them (optionally capped by the "hw.physmem" tunable), removes the kernel
  * image [start, end) and the low exception-vector area from phys_avail[],
  * allocates and zeroes the partition table (when powernv_enabled) and the
  * process table, and finally builds the kernel page tables.
  *
  * "start" and "end" are the kernel's bounds; they are converted to
  * physical addresses by masking off DMAP_BASE_ADDRESS.
  */
 static void
 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
 {
 	vm_paddr_t	kpstart, kpend;
 	vm_size_t	physsz, hwphyssz;
 	//uint64_t	l2virt;
 	int		rm_pavail, proctab_size;
 	int		i, j;
 
 	kpstart = start & ~DMAP_BASE_ADDRESS;
 	kpend = end & ~DMAP_BASE_ADDRESS;
 
 	/* Get physical memory regions from firmware */
 	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
 	CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
 
 	if (2 * VM_PHYSSEG_MAX < regions_sz)
 		panic("mmu_radix_early_bootstrap: phys_avail too small");
 
 	/*
 	 * Note: the "int i" loops below declare their own index, which
 	 * shadows the function-scope "i" declared above.
 	 */
 	if (bootverbose)
 		for (int i = 0; i < regions_sz; i++)
 			printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
 			    i, regions[i].mr_start, i, regions[i].mr_size);
 	/*
 	 * XXX workaround a simulator bug
 	 *
 	 * For any region whose start is not page aligned, round the start
 	 * up to the next page boundary and truncate the size down to a
 	 * whole number of pages.
 	 */
 	for (int i = 0; i < regions_sz; i++)
 		if (regions[i].mr_start & PAGE_MASK) {
 			regions[i].mr_start += PAGE_MASK;
 			regions[i].mr_start &= ~PAGE_MASK;
 			regions[i].mr_size &= ~PAGE_MASK;
 		}
 	if (bootverbose)
 		for (int i = 0; i < pregions_sz; i++)
 			printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
 			    i, pregions[i].mr_start, i, pregions[i].mr_size);
 
 	/*
 	 * Populate phys_avail[] and dump_avail[] as (start, end) pairs, one
 	 * pair per region, accumulating the total in physsz.  A non-zero
 	 * hw.physmem tunable clips the last accepted region and stops the
 	 * walk once that much memory has been accounted for.
 	 */
 	phys_avail_count = 0;
 	physsz = 0;
 	hwphyssz = 0;
 	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
 	for (i = 0, j = 0; i < regions_sz; i++) {
 		if (bootverbose)
 			printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
 			    i, regions[i].mr_start, i, regions[i].mr_size);
 
 		/* Ignore regions smaller than a page. */
 		if (regions[i].mr_size < PAGE_SIZE)
 			continue;
 
 		if (hwphyssz != 0 &&
 		    (physsz + regions[i].mr_size) >= hwphyssz) {
 			if (physsz < hwphyssz) {
 				/* Take only what is left of the budget. */
 				phys_avail[j] = regions[i].mr_start;
 				phys_avail[j + 1] = regions[i].mr_start +
 				    (hwphyssz - physsz);
 				physsz = hwphyssz;
 				phys_avail_count++;
 				dump_avail[j] = phys_avail[j];
 				dump_avail[j + 1] = phys_avail[j + 1];
 			}
 			break;
 		}
 		phys_avail[j] = regions[i].mr_start;
 		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
 		dump_avail[j] = phys_avail[j];
 		dump_avail[j + 1] = phys_avail[j + 1];
 
 		phys_avail_count++;
 		physsz += regions[i].mr_size;
 		j += 2;
 	}
 
 	/* Check for overlap with the kernel and exception vectors */
 	rm_pavail = 0;
 	for (j = 0; j < 2 * phys_avail_count; j+=2) {
 		/* Push a range that starts below EXC_LAST upward. */
 		if (phys_avail[j] < EXC_LAST)
 			phys_avail[j] += EXC_LAST;
 
 		/*
 		 * Range entirely inside the kernel: mark both ends with ~0
 		 * so that the sort below moves it to the tail, where it is
 		 * cleared.
 		 */
 		if (phys_avail[j] >= kpstart &&
 		    phys_avail[j + 1] <= kpend) {
 			phys_avail[j] = phys_avail[j + 1] = ~0;
 			rm_pavail++;
 			continue;
 		}
 
 		/*
 		 * Kernel start falls inside this range: keep the part below
 		 * the kernel here and, if the range also extends past the
 		 * kernel end, append the part above as a new pair.
 		 */
 		if (kpstart >= phys_avail[j] &&
 		    kpstart < phys_avail[j + 1]) {
 			if (kpend < phys_avail[j + 1]) {
 				phys_avail[2 * phys_avail_count] =
 				    (kpend & ~PAGE_MASK) + PAGE_SIZE;
 				phys_avail[2 * phys_avail_count + 1] =
 				    phys_avail[j + 1];
 				phys_avail_count++;
 			}
 
 			phys_avail[j + 1] = kpstart & ~PAGE_MASK;
 		}
 
 		/*
 		 * Kernel end falls inside this range: keep the part above
 		 * the kernel here and, if the range also starts below the
 		 * kernel, append the part below as a new pair.
 		 */
 		if (kpend >= phys_avail[j] &&
 		    kpend < phys_avail[j + 1]) {
 			if (kpstart > phys_avail[j]) {
 				phys_avail[2 * phys_avail_count] = phys_avail[j];
 				phys_avail[2 * phys_avail_count + 1] =
 				    kpstart & ~PAGE_MASK;
 				phys_avail_count++;
 			}
 
 			phys_avail[j] = (kpend & ~PAGE_MASK) +
 			    PAGE_SIZE;
 		}
 	}
 	/* Sort the individual boundaries; appended pairs land in order. */
 	qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
 	/* Snapshot for validate_addr(), taken before the ~0 pairs are cleared. */
 	for (i = 0; i < 2 * phys_avail_count; i++)
 		phys_avail_debug[i] = phys_avail[i];
 
 	/* Remove physical available regions marked for removal (~0) */
 	if (rm_pavail) {
 		phys_avail_count -= rm_pavail;
 		for (i = 2 * phys_avail_count;
 		     i < 2*(phys_avail_count + rm_pavail); i+=2)
 			phys_avail[i] = phys_avail[i + 1] = 0;
 	}
 	if (bootverbose) {
 		printf("phys_avail ranges after filtering:\n");
 		for (j = 0; j < 2 * phys_avail_count; j+=2)
 			printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
 				   j, phys_avail[j], j + 1, phys_avail[j + 1]);
 	}
 	physmem = btoc(physsz);
 
 	/* XXX assume we're running non-virtualized and
 	 * we don't support BHYVE
 	 */
 	if (isa3_pid_bits == 0)
 		isa3_pid_bits = 20;
 	/* The partition table is only allocated when powernv_enabled is set. */
 	if (powernv_enabled) {
 		parttab_phys =
 		    moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
 		validate_addr(parttab_phys, PARTTAB_SIZE);
 		for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
 			pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
 
 	}
 	/* Allocate and zero the process table, aligned to its own size. */
 	proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
 	proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
 	validate_addr(proctab0pa, proctab_size);
 	for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
 		pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
 
 	mmu_radix_setup_pagetables(hwphyssz);
 }
 
 /*
  * Second stage of pmap bootstrap: compute Maxmem, remap early I/O
  * mappings, and carve out thread0's kernel stack, the message buffer, the
  * boot CPU's dynamic per-CPU area and the crash-dump mapping window.
  *
  * The "start" and "end" arguments are not used by this function.
  */
 static void
 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
 {
 	int		i;
 	vm_paddr_t	pa;
 	void		*dpcpu;
 	vm_offset_t va;
 
 	if (bootverbose)
 		printf("%s enter\n", __func__);
 
 	/*
 	 * Calculate the last available physical address, and reserve the
 	 * vm_page_array (upper bound).
 	 *
 	 * NOTE(review): the loop tests phys_avail[i + 2], so the final
 	 * (start, end) pair before the zero terminator is never examined.
 	 * Confirm whether excluding the last range is intended.
 	 */
 	Maxmem = 0;
 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
 		Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
 
 	/*
 	 * Remap any early IO mappings (console framebuffer, etc.)
 	 */
 	bs_remap_earlyboot();
 
 	/*
 	 * Allocate a kernel stack with a guard page for thread0 and map it
 	 * into the kernel page map.  The guard pages are skipped in the
 	 * virtual address space and left unmapped.
 	 */
 	pa = allocpages(kstack_pages);
 	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
 	virtual_avail = va + kstack_pages * PAGE_SIZE;
 	CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
 	thread0.td_kstack = va;
 	for (i = 0; i < kstack_pages; i++) {
 		mmu_radix_kenter(va, pa);
 		pa += PAGE_SIZE;
 		va += PAGE_SIZE;
 	}
 	thread0.td_kstack_pages = kstack_pages;
 
 	/*
 	 * Allocate physical pages for the message buffer; it is accessed
 	 * through the direct map rather than through separate KVA.
 	 */
 	pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK)  >> PAGE_SHIFT);
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
 
 	/*
 	 * Allocate physical pages for the dynamic percpu area of the
 	 * current CPU, likewise accessed through the direct map.
 	 *
 	 * NOTE(review): the page count is DPCPU_SIZE >> PAGE_SHIFT, which
 	 * truncates; presumably DPCPU_SIZE is a page multiple -- confirm.
 	 */
 	pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
 	dpcpu = (void *)PHYS_TO_DMAP(pa);
 	dpcpu_init(dpcpu, curcpu);
 
 	/* Reserve MAXDUMPPGS pages of KVA for the crash dump window. */
 	crashdumpmap = (caddr_t)virtual_avail;
 	virtual_avail += MAXDUMPPGS * PAGE_SIZE;
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.  (Nothing is currently reserved here.)
 	 */
 }
 
 /*
  * Publish the partition table: record its direct-map address in
  * isa3_parttab and load its physical address and encoded size into the
  * PTCR special-purpose register.
  */
 static void
 mmu_parttab_init(void)
 {
 	uint64_t ptcr_val;
 
 	isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
 	ptcr_val = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
 
 	if (bootverbose) {
 		printf("%s parttab: %p\n", __func__, isa3_parttab);
 		printf("setting ptcr %lx\n", ptcr_val);
 	}
 	mtspr(SPR_PTCR, ptcr_val);
 }
 
 /*
  * Rewrite the partition-table entry for "lpid" with the given page-table
  * and process-table words (stored big-endian), then invalidate cached
  * translations for that LPID.
  *
  * The invalidation issued depends on the entry's previous contents: if the
  * old pagetab word had PARTTAB_HR set, two tlbie operations are issued,
  * otherwise a single one with different operands.
  */
 static void
 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
 {
 	uint64_t prev;
 
 	if (bootverbose)
 		printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
 			   lpid, pagetab, proctab);
 	/* Remember the old entry; it selects the invalidation form below. */
 	prev = be64toh(isa3_parttab[lpid].pagetab);
 	isa3_parttab[lpid].pagetab = htobe64(pagetab);
 	isa3_parttab[lpid].proctab = htobe64(proctab);
 
 	if (prev & PARTTAB_HR) {
 		__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 		__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 	} else {
 		__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 	}
 	/* Synchronize after the invalidation(s). */
 	ttusync();
 }
 
 /*
  * Initialize the partition table and install entry 0 so that it points at
  * the kernel pmap's top-level radix directory, with PARTTAB_HR set and no
  * process table yet.
  */
 static void
 mmu_radix_parttab_init(void)
 {
 	uint64_t pgd_pa, pate0;
 
 	mmu_parttab_init();
 
 	pgd_pa = DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1);
 	pate0 = RTS_SIZE | pgd_pa | RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
 	mmu_parttab_update(0, pate0, 0);
 }
 
 /*
  * Attach a process table to partition-table entry 0.  The entry's current
  * page-table word is preserved; the process-table word is built from the
  * table's physical address, its encoded size and PARTTAB_GR.
  */
 static void
 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
 {
 	uint64_t cur_pagetab, new_proctab;
 
 	new_proctab = proctabpa | table_size | PARTTAB_GR;
 	cur_pagetab = be64toh(isa3_parttab[0].pagetab);
 	mmu_parttab_update(0, cur_pagetab, new_proctab);
 }
 
 /*
  * Set up the process table: point its first entry at the kernel pmap's
  * radix tree, register the table (directly through the partition table
  * when powernv_enabled, otherwise through a hypervisor call when built
  * with PSERIES), and give the kernel pmap the first PID.
  */
 static void
 mmu_radix_proctab_init(void)
 {
 
 	isa3_base_pid = 1;
 
 	isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
 	/* Entry 0 describes the kernel's radix tree (stored big-endian). */
 	isa3_proctab->proctab0 =
 	    htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
 		RADIX_PGD_INDEX_SHIFT);
 
 	if (powernv_enabled) {
 		mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
 		/* Order the table update, invalidate for LPID 0, then sync. */
 		__asm __volatile("ptesync" : : : "memory");
 		__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
 		__asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
 #ifdef PSERIES
 	} else {
 		int64_t rc;
 
 		/* Ask the hypervisor to register the process table. */
 		rc = phyp_hcall(H_REGISTER_PROC_TBL,
 		    PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE,
 		    proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12);
 		if (rc != H_SUCCESS)
 			panic("mmu_radix_proctab_init: "
 				"failed to register process table: rc=%jd",
 				(intmax_t)rc);
 #endif
 	}
 
 	if (bootverbose)
 		printf("process table %p and kernel radix PDE: %p\n",
 			   isa3_proctab, kernel_pmap->pm_pml1);
 	/* Set PSL_DR in the MSR and then clear it again. */
 	mtmsr(mfmsr() | PSL_DR );
 	mtmsr(mfmsr() &  ~PSL_DR);
 	/* The kernel pmap takes the first PID; later pmaps start after it. */
 	kernel_pmap->pm_pid = isa3_base_pid;
 	isa3_base_pid++;
 }
 
 /*
  * Apply MADV_DONTNEED or MADV_FREE advice to the range [sva, eva) of
  * "pmap".  Any other advice value is ignored.
  *
  * For managed 4KB mappings the accessed bit is cleared, and the modified
  * bit as well when the mapping is writable and modified; under
  * MADV_DONTNEED the page is first marked dirty.  A managed 2MB mapping is
  * demoted first and, unless wired, one of its 4KB mappings is removed.
  * If anything changed, the whole pmap is invalidated before returning.
  */
 void
 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     int advice)
 {
 	struct rwlock *lock;
 	pml1_entry_t *l1e;
 	pml2_entry_t *l2e;
 	pml3_entry_t oldl3e, *l3e;
 	pt_entry_t *pte;
 	vm_offset_t va, va_next;
 	vm_page_t m;
 	bool anychanged;
 
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 	anychanged = false;
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * Skip ahead over holes at each level.  If the computed next
 		 * address wrapped around, finish the walk instead.
 		 */
 		l1e = pmap_pml1e(pmap, sva);
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		l2e = pmap_l1e_to_l2e(l1e, sva);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (va_next < sva)
 			va_next = eva;
 		l3e = pmap_l2e_to_l3e(l2e, sva);
 		oldl3e = be64toh(*l3e);
 		if ((oldl3e & PG_V) == 0)
 			continue;
 		else if ((oldl3e & RPTE_LEAF) != 0) {
 			/* Large mapping: unmanaged ones are left alone. */
 			if ((oldl3e & PG_MANAGED) == 0)
 				continue;
 			lock = NULL;
 			if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
 				if (lock != NULL)
 					rw_wunlock(lock);
 
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Choosing the last page
 			 * within the address range [sva, min(va_next, eva))
 			 * generally results in more repromotions.  Since the
 			 * underlying page table page is fully populated, this
 			 * removal never frees a page table page.
 			 */
 			if ((oldl3e & PG_W) == 0) {
 				va = eva;
 				if (va > va_next)
 					va = va_next;
 				va -= PAGE_SIZE;
 				KASSERT(va >= sva,
 				    ("mmu_radix_advise: no address gap"));
 				pte = pmap_l3e_to_pte(l3e, va);
 				KASSERT((be64toh(*pte) & PG_V) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL,
 				    &lock);
 				anychanged = true;
 			}
 			if (lock != NULL)
 				rw_wunlock(lock);
 		}
 		if (va_next > eva)
 			va_next = eva;
 		/*
 		 * NOTE(review): "va" is set equal to va_next here and is only
 		 * ever reassigned to va_next below, so the "va != va_next"
 		 * tests in and after this loop cannot be true as written.
 		 * This looks like a remnant of range-invalidation tracking;
 		 * confirm before relying on it.
 		 */
 		va = va_next;
 		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
 			 pte++, sva += PAGE_SIZE) {
 			MPASS(pte == pmap_pte(pmap, sva));
 
 			/* Only valid, managed mappings are touched. */
 			if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 				goto maybe_invlrng;
 			else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				atomic_clear_long(pte, htobe64(PG_M | PG_A));
 			} else if ((be64toh(*pte) & PG_A) != 0)
 				atomic_clear_long(pte, htobe64(PG_A));
 			else
 				goto maybe_invlrng;
 			anychanged = true;
 			continue;
 maybe_invlrng:
 			if (va != va_next) {
 				anychanged = true;
 				va = va_next;
 			}
 		}
 		if (va != va_next)
 			anychanged = true;
 	}
 	/* A single full invalidation covers every change made above. */
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Routines used in machine-dependent code
  */
 /*
  * Top-level pmap bootstrap for the boot CPU.  Runs the early bootstrap,
  * programs the LPCR and partition table when powernv_enabled, sets up the
  * IAMR and the process table, selects the kernel PID, flushes the TLB, and
  * finishes with the late bootstrap.  "start" and "end" are forwarded to
  * both bootstrap stages.
  */
 static void
 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
 {
 	uint64_t lpcr_val;
 
 	if (bootverbose)
 		printf("%s\n", __func__);
 
 	hw_direct_map = 1;
 	/* powernv_enabled mirrors the PSL_HV bit of the current MSR. */
 	powernv_enabled = (mfmsr() & PSL_HV) != 0;
 
 	mmu_radix_early_bootstrap(start, end);
 	if (bootverbose)
 		printf("early bootstrap complete\n");
 
 	if (powernv_enabled) {
 		lpcr_val = mfspr(SPR_LPCR) | LPCR_UPRT | LPCR_HR;
 		mtspr(SPR_LPCR, lpcr_val);
 		mmu_radix_parttab_init();
 		mmu_radix_init_amor();
 		if (bootverbose)
 			printf("powernv init complete\n");
 	}
 
 	mmu_radix_init_iamr();
 	mmu_radix_proctab_init();
 	mmu_radix_pid_set(kernel_pmap);
 	/* Flush scope depends on whether powernv_enabled is set. */
 	mmu_radix_tlbiel_flush(powernv_enabled ? TLB_INVAL_SCOPE_GLOBAL :
 	    TLB_INVAL_SCOPE_LPID);
 
 	mmu_radix_late_bootstrap(start, end);
 	numa_mem_regions(&numa_pregions, &numa_pregions_sz);
 	if (bootverbose)
 		printf("%s done\n", __func__);
 
 	pmap_bootstrapped = 1;
 	/* Direct-map limit: Maxmem in bytes, rounded up to L2_PAGE_SIZE. */
 	dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
 	PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
 }
 
 /*
  * Per-CPU MMU setup.  When powernv_enabled, program the LPCR and PTCR and
  * initialize the AMOR; then initialize the IAMR, select the kernel PID and
  * flush the TLB.  The "ap" argument is not used.
  */
 static void
 mmu_radix_cpu_bootstrap(int ap)
 {
 	uint64_t lpcr_val, ptcr_val;
 
 	if (powernv_enabled) {
 		lpcr_val = mfspr(SPR_LPCR) | LPCR_UPRT | LPCR_HR;
 		mtspr(SPR_LPCR, lpcr_val);
 
 		ptcr_val = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
 		mtspr(SPR_PTCR, ptcr_val);
 		mmu_radix_init_amor();
 	}
 
 	mmu_radix_init_iamr();
 	mmu_radix_pid_set(kernel_pmap);
 	/* Flush scope depends on whether powernv_enabled is set. */
 	mmu_radix_tlbiel_flush(powernv_enabled ? TLB_INVAL_SCOPE_GLOBAL :
 	    TLB_INVAL_SCOPE_LPID);
 }
 
 /*
  * Read-only sysctl counters for superpage activity, exported under
  * vm.pmap.l3e (2MB mappings) and vm.pmap.l2e (1GB mappings).
  */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
     "2MB page mapping counters");
 
 /* Number of 2MB mappings demoted. */
 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions);
 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l3e_demotions, "2MB page demotions");
 
 /* Number of 2MB mappings created directly (see mmu_radix_copy()). */
 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings);
 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_l3e_mappings, "2MB page mappings");
 
 /* Bumped on every failed pmap_promote_l3e() attempt. */
 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures);
 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_l3e_p_failures, "2MB page promotion failures");
 
 /* Bumped on every successful pmap_promote_l3e(). */
 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions);
 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_l3e_promotions, "2MB page promotions");
 
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
     "1GB page mapping counters");
 
 /* Number of 1GB mappings demoted. */
 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions);
 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l2e_demotions, "1GB page demotions");
 
 /*
  * Clear the modified bit in every mapping of the managed page "m".
  *
  * 2MB mappings that cover the page are handled first: a writable one is
  * demoted and, if not wired, the 4KB mapping of "m" is write-protected and
  * the page is marked dirty.  Then each 4KB mapping on the page's own PV
  * list that is both writable and modified has its modified bit cleared.
  *
  * The page must be managed and busied.  The PV list lock is taken here;
  * each pmap is locked in turn while its mapping is examined.
  */
 void
 mmu_radix_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pml3_entry_t oldl3e, *l3e;
 	pt_entry_t oldpte, *pte;
 	struct rwlock *lock;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	vm_page_assert_busied(m);
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->a.flags & PGA_WRITEABLE) == 0)
 		return;
 	/* Fictitious pages share a dummy superpage PV head. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_wlock(lock);
 restart:
 	/* Pass 1: 2MB mappings recorded on the superpage PV list. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Could not take the pmap lock without blocking.
 			 * Drop the PV list lock, block on the pmap lock,
 			 * retake the PV list lock, and start over if the
 			 * list changed in the meantime.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		va = pv->pv_va;
 		l3e = pmap_pml3e(pmap, va);
 		oldl3e = be64toh(*l3e);
 		if ((oldl3e & PG_RW) != 0 &&
 		    pmap_demote_l3e_locked(pmap, l3e, va, &lock) &&
 		    (oldl3e & PG_W) == 0) {
 			/*
 			 * Write protect the mapping to a
 			 * single page so that a subsequent
 			 * write access may repromote.
 			 */
 			/* Offset of "m" within the former 2MB mapping. */
 			va += VM_PAGE_TO_PHYS(m) - (oldl3e &
 			    PG_PS_FRAME);
 			pte = pmap_l3e_to_pte(l3e, va);
 			oldpte = be64toh(*pte);
 			/* Retry the compare-and-set until it succeeds. */
 			while (!atomic_cmpset_long(pte,
 			    htobe64(oldpte),
 				htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
 				   oldpte = be64toh(*pte);
 			vm_page_dirty(m);
 			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: 4KB mappings on the page's own PV list. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* Same dance as above, checking both generations. */
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 		if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			atomic_clear_long(pte, htobe64(PG_M));
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 }
 
 /*
  * Copy mappings for the range [src_addr, src_addr + len) from src_pmap
  * into dst_pmap.  Nothing is done unless dst_addr equals src_addr.
  *
  * Only managed 4KB mappings are copied; a 2MB mapping is copied whole
  * (managed or not) when it lies entirely inside the range.  Copied entries
  * lose their wired bit, and 4KB entries also lose their modified and
  * accessed bits.  The copy stops early when a page-table page or PV entry
  * cannot be obtained.
  */
 void
 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	struct rwlock *lock;
 	struct spglist free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t va_next;
 	vm_page_t dst_pdpg, dstmpte, srcmpte;
 	bool invalidate_all;
 
 	CTR6(KTR_PMAP,
 	    "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
 	    __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
 
 	if (dst_addr != src_addr)
 		return;
 	lock = NULL;
 	invalidate_all = false;
 	/* Lock both pmaps in address order so the order is consistent. */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		pml1_entry_t *l1e;
 		pml2_entry_t *l2e;
 		pml3_entry_t srcptepaddr, *l3e;
 		pt_entry_t *src_pte, *dst_pte;
 
 		/*
 		 * Skip holes in the source at each level.  If the computed
 		 * next address wrapped around, finish the walk instead.
 		 */
 		l1e = pmap_pml1e(src_pmap, addr);
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 
 		l2e = pmap_l1e_to_l2e(l1e, addr);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 
 		va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (va_next < addr)
 			va_next = end_addr;
 
 		l3e = pmap_l2e_to_l3e(l2e, addr);
 		srcptepaddr = be64toh(*l3e);
 		if (srcptepaddr == 0)
 			continue;
 
 		if (srcptepaddr & RPTE_LEAF) {
 			/*
 			 * 2MB source mapping: copy it only if the range
 			 * covers it completely.
 			 */
 			if ((addr & L3_PAGE_MASK) != 0 ||
 			    addr + L3_PAGE_SIZE > end_addr)
 				continue;
 			dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
 			if (dst_pdpg == NULL)
 				break;
 			/* From here on l3e refers to the destination entry. */
 			l3e = (pml3_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
 			l3e = &l3e[pmap_pml3e_index(addr)];
 			/*
 			 * Install the mapping only if the destination slot
 			 * is empty and, for managed mappings, a PV entry
 			 * could be inserted; otherwise give back the
 			 * reference taken by pmap_allocl3e().
 			 */
 			if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM, &lock))) {
 				*l3e = htobe64(srcptepaddr & ~PG_W);
 				pmap_resident_count_inc(dst_pmap,
 				    L3_PAGE_SIZE / PAGE_SIZE);
 				counter_u64_add(pmap_l3e_mappings, 1);
 			} else
 				dst_pdpg->ref_count--;
 			continue;
 		}
 
 		/* The source entry points at a page table page. */
 		srcptepaddr &= PG_FRAME;
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		KASSERT(srcmpte->ref_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 
 		if (va_next > end_addr)
 			va_next = end_addr;
 
 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 		src_pte = &src_pte[pmap_pte_index(addr)];
 		dstmpte = NULL;
 		while (addr < va_next) {
 			pt_entry_t ptetemp;
 			ptetemp = be64toh(*src_pte);
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				/*
 				 * Reuse the destination page table page from
 				 * the previous iteration when it still covers
 				 * this address; otherwise allocate one.
 				 */
 				if (dstmpte != NULL &&
 				    dstmpte->pindex == pmap_l3e_pindex(addr))
 					dstmpte->ref_count++;
 				else if ((dstmpte = pmap_allocpte(dst_pmap,
 				    addr, NULL)) == NULL)
 					goto out;
 				dst_pte = (pt_entry_t *)
 				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 				dst_pte = &dst_pte[pmap_pte_index(addr)];
 				if (be64toh(*dst_pte) == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
 				    &lock)) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					*dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
 					    PG_A));
 					pmap_resident_count_inc(dst_pmap, 1);
 				} else {
 					/*
 					 * Slot occupied or no PV entry: drop
 					 * the page table page reference and
 					 * give up on the rest of the copy.
 					 */
 					SLIST_INIT(&free);
 					if (pmap_unwire_ptp(dst_pmap, addr,
 					    dstmpte, &free)) {
 						/*
 						 * Although "addr" is not
 						 * mapped, paging-structure
 						 * caches could nonetheless
 						 * have entries that refer to
 						 * the freed page table pages.
 						 * Invalidate those entries.
 						 */
 						invalidate_all = true;
 						vm_page_free_pages_toq(&free,
 						    true);
 					}
 					goto out;
 				}
 				/*
 				 * Stop scanning this page table page once
 				 * the destination holds at least as many
 				 * references as the source.
 				 */
 				if (dstmpte->ref_count >= srcmpte->ref_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			/* Re-walk the source when crossing an L3 boundary. */
 			if (__predict_false((addr & L3_PAGE_MASK) == 0))
 				src_pte = pmap_pte(src_pmap, addr);
 			else
 				src_pte++;
 		}
 	}
 out:
 	if (invalidate_all)
 		pmap_invalidate_all(dst_pmap);
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Copy the contents of page "msrc" to page "mdst", accessing both through
  * the direct map.
  */
 static void
 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	vm_offset_t dst, src;
 
 	src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
 
 	/* XXX slow */
 	bcopy((void *)src, (void *)dst, PAGE_SIZE);
 }
 
 /*
  * Copy "xfersize" bytes from byte offset "a_offset" within the page array
  * "ma" to byte offset "b_offset" within the page array "mb".  Each chunk
  * is limited so that it crosses a page boundary in neither array, and all
  * accesses go through the direct map.
  */
 static void
 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_offset_t a_pg_offset, b_pg_offset;
 	char *src_base, *dst_base;
 	void *a_cp, *b_cp;
 	int cnt;
 
 	CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
 	    a_offset, mb, b_offset, xfersize);
 
 	while (xfersize > 0) {
 		a_pg_offset = a_offset & PAGE_MASK;
 		b_pg_offset = b_offset & PAGE_MASK;
 
 		/* Stay within the current page on both sides. */
 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 
 		src_base = (char *)(uintptr_t)PHYS_TO_DMAP(
 		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
 		dst_base = (char *)(uintptr_t)PHYS_TO_DMAP(
 		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
 		a_cp = src_base + a_pg_offset;
 		b_cp = dst_base + b_pg_offset;
 
 		bcopy(a_cp, b_cp, cnt);
 
 		a_offset += cnt;
 		b_offset += cnt;
 		xfersize -= cnt;
 	}
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Tries to promote the 512, contiguous 4KB page mappings that are within a
  * single page table page (PTP) to a single 2MB page mapping.  For promotion
  * to occur, two conditions must be met: (1) the 4KB page mappings must map
  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  * identical characteristics.
  *
  * "pde" is the L3 entry that currently points at the PTP and "va" is an
  * address within the candidate 2MB range.  The pmap lock must be held.
  * Returns 0 on success and KERN_FAILURE if promotion was not possible;
  * the promotion or failure counter is bumped accordingly.
  */
 static int
 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pml3_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
 	vm_page_t mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
 	 * either invalid, unused, or does not map the first 4KB physical page
 	 * within a 2MB page.
 	 */
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
 setpde:
 	newpde = be64toh(*firstpte);
 	if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 		CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		goto fail;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
 		 */
 		/* If the PTE changed underneath us, re-read and re-check. */
 		if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
 			goto setpde;
 		newpde &= ~RPTE_EAA_W;
 	}
 
 	/*
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 */
 	/*
 	 * "pa" holds the value expected from the last PTE (frame plus the
 	 * accessed and valid bits); the scan runs backwards from there.
 	 */
 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 setpte:
 		oldpte = be64toh(*pte);
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 			CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			goto fail;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
 			 */
 			if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
 				goto setpte;
 			oldpte &= ~RPTE_EAA_W;
 			CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
 			    " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
 			    (va & ~L3_PAGE_MASK), pmap);
 		}
 		/* All attribute bits relevant to promotion must match. */
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 			CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			goto fail;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the L3 entry
 	 * mapping the superpage is demoted or destroyed.
 	 */
 	mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_l3e: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_l3e_pindex(va),
 	    ("pmap_promote_l3e: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, mpte)) {
 		CTR2(KTR_PMAP,
 		    "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
 		    pmap);
 		goto fail;
 	}
 
 	/*
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
 		pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
 
 	/* Install the 2MB leaf in place of the pointer to the PTP. */
 	pte_store(pde, PG_PROMOTED | newpde);
 	ptesync();
 	counter_u64_add(pmap_l3e_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (0);
  fail:
 	counter_u64_add(pmap_l3e_p_failures, 1);
 	return (KERN_FAILURE);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 int
 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 	pt_entry_t newpte, origpte;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	int rv, retrycount;
 	boolean_t nosleep, invalidate_all, invalidate_page;
 
 	va = trunc_page(va);
 	retrycount = 0;
 	invalidate_page = invalidate_all = false;
 	CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
 	    m, prot, flags, psind);
 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
 	    ("pmap_enter: managed mapping within the clean submap"));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 	    ("pmap_enter: flags %u has reserved bits set", flags));
 	pa = VM_PAGE_TO_PHYS(m);
 	newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
 	if ((flags & VM_PROT_WRITE) != 0)
 		newpte |= PG_M;
 	if ((flags & VM_PROT_READ) != 0)
 		newpte |= PG_A;
 	if (prot & VM_PROT_READ)
 		newpte |= RPTE_EAA_R;
 	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= RPTE_EAA_W;
 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 
 	if (prot & VM_PROT_EXECUTE)
 		newpte |= PG_X;
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		newpte |= PG_W;
 	if (va >= DMAP_MIN_ADDRESS)
 		newpte |= RPTE_EAA_P;
 	newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
 	 * the page is unmanaged. We do not want to take a fault
 	 * to do the dirty bit accounting for these mappings.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		if ((newpte & PG_RW) != 0)
 			newpte |= PG_M;
 	} else
 		newpte |= PG_MANAGED;
 
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
 		goto out;
 	}
 	mpte = NULL;
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 retry:
 	l3e = pmap_pml3e(pmap, va);
 	if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
 	    pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
 		pte = pmap_l3e_to_pte(l3e, va);
 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 			mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
 			mpte->ref_count++;
 		}
 	} else if (va < VM_MAXUSER_ADDRESS) {
 		/*
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 		mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
 		    nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 		if (__predict_false(retrycount++ == 6))
 			panic("too many retries");
 		invalidate_all = true;
 		goto retry;
 	} else
 		panic("pmap_enter: invalid page directory va=%#lx", va);
 
 	origpte = be64toh(*pte);
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((origpte & PG_V) != 0) {
 #ifdef INVARIANTS
 		if (VERBOSE_PMAP || pmap_logging) {
 			printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
 			    " asid=%lu curpid=%d name=%s origpte0x%lx\n",
 			    pmap, va, m, prot, flags, psind, pmap->pm_pid,
 			    curproc->p_pid, curproc->p_comm, origpte);
 			pmap_pte_walk(pmap->pm_pml1, va);
 		}
 #endif
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->ref_count--;
 			KASSERT(mpte->ref_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		opa = origpte & PG_FRAME;
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((origpte & PG_MANAGED) != 0 &&
 			    (newpte & PG_RW) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
 				if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
 					if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
 						goto retry;
 					if ((newpte & PG_M) != (origpte & PG_M))
 						vm_page_dirty(m);
 					if ((newpte & PG_A) != (origpte & PG_A))
 						vm_page_aflag_set(m, PGA_REFERENCED);
 					ptesync();
 				} else
 					invalidate_all = true;
 				if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 					goto unchanged;
 			}
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		origpte = be64toh(pte_load_clear(pte));
 		KASSERT((origpte & PG_FRAME) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((origpte & PG_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(om);
 			if ((origpte & PG_A) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			if ((newpte & PG_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 #ifdef INVARIANTS
 			else if (origpte & PG_MANAGED) {
 				if (pv == NULL) {
 					pmap_page_print_mappings(om);
 					MPASS(pv != NULL);
 				}
 			}
 #endif
 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		if ((origpte & PG_A) != 0)
 			invalidate_page = true;
 		origpte = 0;
 	} else {
 		if (pmap != kernel_pmap) {
 #ifdef INVARIANTS
 			if (VERBOSE_PMAP || pmap_logging)
 				printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
 				    pmap, va, m, prot, flags, psind,
 				    pmap->pm_pid, curproc->p_pid,
 				    curproc->p_comm);
 #endif
 		}
 
 		/*
 		 * Increment the counters.
 		 */
 		if ((newpte & PG_W) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((newpte & PG_MANAGED) != 0) {
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 #ifdef VERBOSE_PV
 		else
 			printf("reassigning pv: %p to pmap: %p\n",
 				   pv, pmap);
 #endif
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 		m->md.pv_gen++;
 		if ((newpte & PG_RW) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Update the PTE.
 	 */
 	if ((origpte & PG_V) != 0) {
 validate:
 		origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
 		KASSERT((origpte & PG_FRAME) == pa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 		    (PG_M | PG_RW)) {
 			if ((origpte & PG_MANAGED) != 0)
 				vm_page_dirty(m);
 			invalidate_page = true;
 
 			/*
 			 * Although the PTE may still have PG_RW set, TLB
 			 * invalidation may nonetheless be required because
 			 * the PTE no longer has PG_M set.
 			 */
 		} else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
 			/*
 			 * Removing capabilities requires invalidation on POWER
 			 */
 			invalidate_page = true;
 			goto unchanged;
 		}
 		if ((origpte & PG_A) != 0)
 			invalidate_page = true;
 	} else {
 		pte_store(pte, newpte);
 		ptesync();
 	}
 unchanged:
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * If both the page table page and the reservation are fully
 	 * populated, then attempt promotion.
 	 */
 	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
 	    mmu_radix_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0 &&
 		pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
 		invalidate_all = true;
 #endif
 	if (invalidate_all)
 		pmap_invalidate_all(pmap);
 	else if (invalidate_page)
 		pmap_invalidate_page(pmap, va);
 
 	rv = KERN_SUCCESS;
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB mapping that grants at most read and execute
  * access.  The result is true when the mapping was created.  The result is
  * false when (1) a page table page could not be allocated without sleeping,
  * (2) the virtual address is already mapped, or (3) no PV entry could be
  * obtained without reclaiming another PV entry.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pml3_entry_t l3e_bits;
 	u_int enter_flags;
 	int status;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Valid leaf entry carrying the page's frame and cache attributes. */
 	l3e_bits = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
 	    RPTE_LEAF | PG_V;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		l3e_bits |= PG_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		l3e_bits |= PG_X;
 	if ((prot & VM_PROT_READ) != 0)
 		l3e_bits |= RPTE_EAA_R;
 	if (va >= DMAP_MIN_ADDRESS)
 		l3e_bits |= RPTE_EAA_P;
 
 	/* Never sleep, never replace an existing mapping, never reclaim. */
 	enter_flags = PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE |
 	    PMAP_ENTER_NORECLAIM;
 	status = pmap_enter_l3e(pmap, va, l3e_bits, enter_flags, NULL, lockp);
 	return (status == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * The caller must hold the pmap lock.  "newpde" is the complete L3 leaf
  * entry to install; a writeable entry must already have PG_M set.
  */
 static int
 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pml3_entry_t oldl3e, *l3e;
 	vm_page_t mt, pdpg;
 
 	KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Look up (or allocate) the page directory page covering "va".  With
 	 * PMAP_ENTER_NOSLEEP a NULL lock pointer is passed instead of "lockp";
 	 * presumably that is what prevents the allocator from sleeping.
 	 */
 	if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 	/* Locate the L3 entry for "va" through the direct map. */
 	l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 	l3e = &l3e[pmap_pml3e_index(va)];
 	oldl3e = be64toh(*l3e);
 	if ((oldl3e & PG_V) != 0) {
 		KASSERT(pdpg->ref_count > 1,
 		    ("pmap_enter_pde: pdpg's wire count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			/* Give back the reference taken by pmap_allocl3e(). */
 			pdpg->ref_count--;
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_FAILURE);
 		}
 		/* Break the existing mapping(s). */
 		SLIST_INIT(&free);
 		if ((oldl3e & RPTE_LEAF) != 0) {
 			/*
 			 * The reference to the PD page that was acquired by
 			 * pmap_allocl3e() ensures that it won't be freed.
 			 * However, if the PDE resulted from a promotion, then
 			 * a reserved PT page could be freed.
 			 */
 			(void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
 			pmap_invalidate_l3e_page(pmap, va, oldl3e);
 		} else {
 			/* The old entry pointed at a page of 4KB mappings. */
 			if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
 			    &free, lockp))
 		               pmap_invalidate_all(pmap);
 		}
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
 			if (pmap_insert_pt_page(pmap, mt)) {
 				/*
 				 * XXX Currently, this can't happen because
 				 * we do not perform pmap_enter(psind == 1)
 				 * on the kernel pmap.
 				 */
 				panic("pmap_enter_pde: trie insert failed");
 			}
 		} else
 			KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
 			    l3e));
 	}
 	if ((newpde & PG_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-
 				 * structure caches could nonetheless have
 				 * entries that refer to the freed page table
 				 * pages.  Invalidate those entries.
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		if ((newpde & PG_RW) != 0) {
 			/* Flag every 4KB page of the superpage as writeable. */
 			for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 		}
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((newpde & PG_W) != 0)
 		pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
 	pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 
 	/*
 	 * Map the superpage.  (This is not a promoted mapping; there will not
 	 * be any lingering 4KB page mappings in the TLB.)
 	 */
 	pte_store(l3e, newpde);
 	ptesync();
 
 	counter_u64_add(pmap_l3e_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (KERN_SUCCESS);
 }
 
 /*
  * Walks the object's page list starting at "m_start" and enters a mapping
  * for each page whose index falls inside [start, end).  A 2MB mapping is
  * tried first when the address is superpage aligned, the whole superpage
  * fits below "end", the page reports psind == 1, and superpages are enabled
  * for the pmap; every other page goes through the 4KB quick-enter path.
  */
 void
 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pvlock;
 	vm_page_t pg, ptpage;
 	vm_pindex_t npages, off;
 	vm_offset_t va;
 	bool need_inval;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
 	    end, m_start, prot);
 
 	need_inval = false;
 	npages = atop(end - start);
 	ptpage = NULL;
 	pvlock = NULL;
 
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		off = pg->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		va = start + ptoa(off);
 		if ((va & L3_PAGE_MASK) != 0 || va + L3_PAGE_SIZE > end ||
 		    pg->psind != 1 || !mmu_radix_ps_enabled(pmap) ||
 		    !pmap_enter_2mpage(pmap, va, pg, prot, &pvlock)) {
 			/* Fall back to a single 4KB mapping. */
 			ptpage = mmu_radix_enter_quick_locked(pmap, va, pg,
 			    prot, ptpage, &pvlock, &need_inval);
 		} else {
 			/* Skip to the last 4KB page of the superpage. */
 			pg = &pg[L3_PAGE_SIZE / PAGE_SIZE - 1];
 		}
 	}
 	ptesync();
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	if (need_inval)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Enters a single 4KB mapping for page "m" at "va" without sleeping, but
  * only if "va" is not already mapped.  The caller must hold the pmap lock.
  *
  * "mpte" is a hint: when it is the page table page that covers "va", the
  * directory lookup is skipped and only its reference count is bumped.
  * "*invalidate" is set to true when a page table page was freed while
  * backing out of a failed PV entry insertion.
  *
  * Returns the page table page that now holds the mapping, for use as the
  * hint on the next call.  Returns NULL for kernel addresses and whenever no
  * mapping was created (superpage already present, page table page
  * allocation failure, existing PTE, or PV entry allocation failure).
  *
  * The entry written here never gets a write permission bit: it is given
  * PG_X when "prot" includes VM_PROT_EXECUTE and RPTE_EAA_R otherwise.
  */
 static vm_page_t
 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
 {
 	struct spglist free;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 
 	KASSERT(!VA_IS_CLEANMAP(va) ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t ptepindex;
 		pml3_entry_t *ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = pmap_l3e_pindex(va);
 		if (mpte && (mpte->pindex == ptepindex)) {
 			/* The hint covers "va"; just take another reference. */
 			mpte->ref_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap_pml3e(pmap, va);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
 				/* A superpage already maps this range. */
 				if (be64toh(*ptepa) & RPTE_LEAF)
 					return (NULL);
 				mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
 				mpte->ref_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		pte = &pte[pmap_pte_index(va)];
 	} else {
 		/* Kernel addresses: no page table page is tracked here. */
 		mpte = NULL;
 		pte = pmap_pte(pmap, va);
 	}
 	/* Any non-zero PTE means "va" is already in use; leave it alone. */
 	if (be64toh(*pte)) {
 		if (mpte != NULL) {
 			mpte->ref_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-
 				 * structure caches could nonetheless have
 				 * entries that refer to the freed page table
 				 * pages.  Invalidate those entries.
 				 */
 				*invalidate = true;
 				vm_page_free_pages_toq(&free, true);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	/* Build the new entry: frame, cache attributes, then access bits. */
 	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
 	if (prot & VM_PROT_EXECUTE)
 		pa |= PG_X;
 	else
 		pa |= RPTE_EAA_R;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		pa |= PG_MANAGED;
 
 	pte_store(pte, pa);
 	return (mpte);
 }
 
 /*
  * Locked wrapper around mmu_radix_enter_quick_locked() for a single page:
  * takes the pmap lock, attempts the mapping with no page table page hint,
  * and then performs whatever invalidation the attempt requested.
  */
 void
 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 	bool flush_all = false;
 
 	PMAP_LOCK(pmap);
 	(void)mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock,
 	    &flush_all);
 	ptesync();
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	if (flush_all)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Translates "va" in "pmap" to a physical address by reading the page
  * tables without taking the pmap lock.
  *
  * Returns 0 when no L3 entry (or no PTE pointer) can be found for "va".
  * Otherwise returns the frame taken from the L3 leaf entry or the PTE,
  * combined with the offset of "va" inside that 2MB or 4KB page.
  */
 vm_paddr_t
 mmu_radix_extract(pmap_t pmap, vm_offset_t va)
 {
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 
 	l3e = pmap_pml3e(pmap, va);
 	if (__predict_false(l3e == NULL))
 		return (0);
 	if (be64toh(*l3e) & RPTE_LEAF) {
 		/* Superpage: frame plus the offset within the 2MB page. */
 		pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
 	} else {
 		/*
 		 * Beware of a concurrent promotion that changes the
 		 * PDE at this point!  For example, vtopte() must not
 		 * be used to access the PTE because it would use the
 		 * new PDE.  It is, however, safe to use the old PDE
 		 * because the page table page is preserved by the
 		 * promotion.
 		 */
 		pte = pmap_l3e_to_pte(l3e, va);
 		if (__predict_false(pte == NULL))
 			return (0);
 		/* 4KB page: frame plus the offset within the page. */
 		pa = (be64toh(*pte) & PG_FRAME) | (va & PAGE_MASK);
 	}
 	return (pa);
 }
 
 /*
  * Looks up the page mapped at "va" and wires it, all under the pmap lock.
  * When "prot" asks for VM_PROT_WRITE the mapping must have PG_RW set.
  * Returns the wired page, or NULL when there is no suitable mapping or
  * vm_page_wire_mapped() refuses the page.
  */
 vm_page_t
 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pml3_entry_t l3e, *l3ep;
 	pt_entry_t pte;
 	vm_page_t m;
 	bool want_write;
 
 	m = NULL;
 	want_write = (prot & VM_PROT_WRITE) != 0;
 	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
 	PMAP_LOCK(pmap);
 	l3ep = pmap_pml3e(pmap, va);
 	if (l3ep == NULL || (l3e = be64toh(*l3ep)) == 0)
 		goto done;
 
 	if ((l3e & RPTE_LEAF) != 0) {
 		/* 2MB mapping: pick the 4KB page that contains "va". */
 		if (!want_write || (l3e & PG_RW) != 0)
 			m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
 			    (va & L3_PAGE_MASK));
 	} else {
 		/* Native endian PTE, do not pass to pmap functions */
 		pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
 		if ((pte & PG_V) != 0 &&
 		    (!want_write || (pte & PG_RW) != 0))
 			m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 	}
 	if (m != NULL && !vm_page_wire_mapped(m))
 		m = NULL;
 done:
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Extends the kernel page tables so that they cover addresses up to "addr"
  * (rounded up to a 2MB boundary and clamped to the end of kernel_map),
  * advancing kernel_vm_end one L3 page at a time.  Missing L2 and L3 level
  * pages are allocated wired and zeroed; allocation failure panics.
  */
 static void
 mmu_radix_growkernel(vm_offset_t addr)
 {
 	pml2_entry_t *l2e;
 	pml3_entry_t *l3e;
 	vm_page_t ptpage;
 	vm_paddr_t ptpa;
 
 	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
 
 	/* Nothing to do inside the range covered by the first nkpt pages. */
 	if (VM_MIN_KERNEL_ADDRESS < addr &&
 	    addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
 		return;
 
 	addr = roundup2(addr, L3_PAGE_SIZE);
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 
 	while (kernel_vm_end < addr) {
 		l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			/* We need a new PDP entry */
 			ptpage = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (ptpage == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			ptpage->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT;
 			ptpa = VM_PAGE_TO_PHYS(ptpage);
 			pde_store(l2e, ptpa);
 			continue; /* try again */
 		}
 
 		l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
 		if ((be64toh(*l3e) & PG_V) == 0) {
 			/* No page table page here yet; install one. */
 			ptpage = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (ptpage == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			ptpage->pindex = pmap_l3e_pindex(kernel_vm_end);
 			ptpa = VM_PAGE_TO_PHYS(ptpage);
 			pde_store(l3e, ptpa);
 		}
 
 		/* Step to the next 2MB boundary, stopping at the map's end. */
 		kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 	ptesync();
 }
 
 /* Malloc type describing radix page table root directories. */
 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
 /* UMA cache zone for root directories; created in mmu_radix_init(). */
 static uma_zone_t zone_radix_pgd;
 
 /*
  * Import callback for the "radix_pgd_cache" zone.  For each of the "count"
  * requested items it allocates RADIX_PGD_SIZE / PAGE_SIZE physically
  * contiguous pages and stores the direct-map address of the first page in
  * store[i].  Always reports "count" items as imported.
  *
  * NOTE(review): the allocation result "m" is used without a NULL check;
  * confirm that every caller's flags guarantee the allocation cannot fail.
  */
 static int
 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
     int flags)
 {
+	int req;
 
+	req = VM_ALLOC_WIRED | malloc2vm_flags(flags);
 	for (int i = 0; i < count; i++) {
-		vm_page_t m = vm_page_alloc_contig(NULL, 0,
-		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
-		    VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE,
+		vm_page_t m = vm_page_alloc_noobj_contig(req,
+		    RADIX_PGD_SIZE / PAGE_SIZE,
 		    0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
 		    VM_MEMATTR_DEFAULT);
-		/* XXX zero on alloc here so we don't have to later */
 		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	}
 	return (count);
 }
 
 /*
  * Release callback for the "radix_pgd_cache" zone.  Each stored item is a
  * direct-map address of a run of RADIX_PGD_SIZE / PAGE_SIZE pages; every
  * page in the run is unwired and the run is then freed.
  */
 static void
 radix_pgd_release(void *arg __unused, void **store, int count)
 {
 	struct spglist pglist;
 	vm_page_t base;
 	int idx, item, npages;
 
 	SLIST_INIT(&pglist);
 	npages = RADIX_PGD_SIZE/PAGE_SIZE;
 
 	for (item = 0; item < count; item++) {
 		/*
 		 * XXX selectively remove dmap and KVA entries so we don't
 		 * need to bzero
 		 */
 		base = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[item]));
 
 		/* Push from the last page down so the list ends up in order. */
 		idx = npages;
 		while (idx-- > 0) {
 			vm_page_unwire_noq(&base[idx]);
 			SLIST_INSERT_HEAD(&pglist, &base[idx], plinks.s.ss);
 		}
 		vm_page_free_pages_toq(&pglist, false);
 	}
 }
 
 static void
 mmu_radix_init()
 {
 	vm_page_t mpte;
 	vm_size_t s;
 	int error, i, pv_npg;
 
 	/* XXX is this really needed for POWER? */
 	/* L1TF, reserve page @0 unconditionally */
 	vm_page_blacklist_add(0, bootverbose);
 
 	zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
 		RADIX_PGD_SIZE, NULL, NULL,
 #ifdef INVARIANTS
 	    trash_init, trash_fini,
 #else
 	    NULL, NULL,
 #endif
 		radix_pgd_import, radix_pgd_release,
 		NULL, UMA_ZONE_NOBUCKET);
 
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (i = 0; i < nkpt; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
 		    ("pmap_init: page table page is out of range size: %lu",
 		     vm_page_array_size));
 		mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 		MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
 		//pmap_insert_pt_page(kernel_pmap, mpte);
 		mpte->ref_count = 1;
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	vm_wire_add(nkpt);
 
 	CTR1(KTR_PMAP, "%s()", __func__);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	/*
 	 * Are large page mappings enabled?
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
 	if (superpages_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = L3_PAGE_SIZE;
 	}
 
 	/*
 	 * Initialize the pv chunk list mutex.
 	 */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 
 	/*
 	 * Initialize the pool of pv list locks.
 	 */
 	for (i = 0; i < NPV_LIST_LOCKS; i++)
 		rw_init(&pv_list_locks[i], "pmap pv list");
 
 	/*
 	 * Calculate the size of the pv head table for superpages.
 	 */
 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
 
 	/*
 	 * Allocate memory for the pv head table for superpages.
 	 */
 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 	s = round_page(s);
 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 	for (i = 0; i < pv_npg; i++)
 		TAILQ_INIT(&pv_table[i].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	pmap_initialized = 1;
 	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 	    (vmem_addr_t *)&qframe);
 
 	if (error != 0)
 		panic("qframe allocation failed");
 	asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
 	    1, 1, M_WAITOK);
 }
 
 /*
  * Reports whether any mapping of page "m" has the requested bits set.
  * When "modified" is true the mapping must have both PG_RW and PG_M; when
  * "accessed" is true it must have both PG_V and PG_A.  The 4KB mappings on
  * the page's own PV list are checked first, then (for non-fictitious pages)
  * the 2MB mappings on the superpage PV head.  Returns TRUE at the first
  * match and FALSE if none matches.
  *
  * The PV list lock is held for reading throughout.  Each pmap is try-locked;
  * if that fails the PV list lock is dropped to take the pmap lock, and the
  * whole scan restarts when a PV generation count changed in the meantime.
  */
 static boolean_t
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct rwlock *lock;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	pt_entry_t *pte, mask;
 	pmap_t pmap;
 	int md_gen, pvh_gen;
 	boolean_t rv;
 
 	rv = FALSE;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	/* Pass 1: 4KB mappings of this page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* Remember the generation before dropping the lock. */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		mask = 0;
 		if (modified)
 			mask |= PG_RW | PG_M;
 		if (accessed)
 			mask |= PG_V | PG_A;
 		rv = (be64toh(*pte) & mask) == mask;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	/* Pass 2: 2MB mappings that contain this page. */
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* Here "pte" points at the L3 (superpage) entry. */
 			pte = pmap_pml3e(pmap, pv->pv_va);
 			mask = 0;
 			if (modified)
 				mask |= PG_RW | PG_M;
 			if (accessed)
 				mask |= PG_V | PG_A;
 			rv = (be64toh(*pte) & mask) == mask;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Reports whether any mapping of the given managed page has both
  *	its write-permission and modified bits set.
  */
 boolean_t
 mmu_radix_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	/*
 	 * If the page is not busied then this check is racy.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  * Reports whether "addr" could be prefaulted: TRUE only when its L3 entry
  * is valid and is not a leaf (so a page table page exists) while the 4KB
  * PTE itself is not valid.
  */
 boolean_t
 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pml3_entry_t *l3ep, l3e_val;
 	pt_entry_t *ptep;
 	boolean_t prefaultable;
 
 	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
 	prefaultable = FALSE;
 	PMAP_LOCK(pmap);
 	l3ep = pmap_pml3e(pmap, addr);
 	if (l3ep != NULL) {
 		l3e_val = be64toh(*l3ep);
 		if ((l3e_val & (RPTE_LEAF | PG_V)) == PG_V) {
 			ptep = pmap_l3e_to_pte(l3ep, addr);
 			prefaultable = (be64toh(*ptep) & PG_V) == 0;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 	return (prefaultable);
 }
 
 /*
  * Reports whether any mapping of the given managed page is valid and has
  * its accessed bit set.
  */
 boolean_t
 mmu_radix_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  *
  *	A DI block is not needed within this function, because
  *	invalidations are performed before the PV list lock is
  *	released.
  *
  *	NOTE(review): the function is declared boolean_t but returns the
  *	count "cleared + not_cleared"; confirm the intended return type
  *	against the prototype.
  *
  *	NOTE(review): the local "free" list is initialized and handed to
  *	vm_page_free_pages_toq() at the end, but nothing in this function
  *	adds a page to it.
  */
 boolean_t
 mmu_radix_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pml3_entry_t oldl3e, *l3e;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 	int cleared, md_gen, not_cleared, pvh_gen;
 	struct spglist free;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	/* Fictitious pages have no superpage PV head; use the empty dummy. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 	rw_wlock(lock);
 retry:
 	/* "cleared" survives a retry; only "not_cleared" is recounted. */
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	/* Pass 1: 2MB mappings that contain this page. */
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* Drop the PV lock to take the pmap lock in order. */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		oldl3e = be64toh(*l3e);
 		if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Although "oldl3e" is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((oldl3e & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
 			 * tested.  Apply a simple "hash" function on the
 			 * physical page number, the virtual superpage number,
 			 * and the pmap address to select one 4KB page out of
 			 * the 512 on which testing the reference bit will
 			 * result in clearing that reference bit.  This
 			 * function is designed to avoid the selection of the
 			 * same 4KB page for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 			    (oldl3e & PG_W) == 0) {
 				atomic_clear_long(l3e, htobe64(PG_A));
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 				    ("inconsistent pv lock %p %p for page %p",
 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 			pvh->pv_gen++;
 		}
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	/* Pass 2: 4KB mappings of this page. */
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
 		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 		if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if ((be64toh(*pte) & PG_A) != 0) {
 			/* 4KB reference bits are always cleared when found. */
 			atomic_clear_long(pte, htobe64(PG_A));
 			pmap_invalidate_page(pmap, pv->pv_va);
 			cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 	return (cleared + not_cleared);
 }
 
 /*
  * Returns the direct-map virtual address of physical address "start".
  * "virt", "end" and "prot" are only reported in the trace record.
  */
 static vm_offset_t
 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
     vm_paddr_t end, int prot __unused)
 {
 	vm_offset_t dmap_va;
 
 	CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
 	    prot);
 	dmap_va = PHYS_TO_DMAP(start);
 	return (dmap_va);
 }
 
 /*
  * Pre-maps a device or scatter/gather object with 2MB mappings as a pure
  * optimization.  Nothing is done unless "addr" and "size" are both 2MB
  * aligned, superpages are enabled for the pmap, the object can be fully
  * populated, and its pages are physically contiguous starting on a 2MB
  * boundary with identical cache attributes.  L3 entries that are already
  * valid are left untouched, and entries whose page directory page cannot be
  * allocated are skipped.
  */
 void
 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
 {
 	pml3_entry_t *l3e;
 	vm_paddr_t pa, ptepa;
 	vm_page_t p, pdpg;
 	vm_memattr_t ma;
 
 	CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
 	    object, pindex, size);
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 			("pmap_object_init_pt: non-device object"));
 	/* NB: size can be logically ored with addr here */
 	if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
 		if (!mmu_radix_ps_enabled(pmap))
 			return;
 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
 			return;
 		p = vm_page_lookup(object, pindex);
 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_object_init_pt: invalid page %p", p));
 		ma = p->md.mdpg_cache_attrs;
 
 		/*
 		 * Abort the mapping if the first page is not physically
 		 * aligned to a 2MB page boundary.
 		 */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & L3_PAGE_MASK)
 			return;
 
 		/*
 		 * Skip the first page.  Abort the mapping if the rest of
 		 * the pages are not physically contiguous or have differing
 		 * memory attributes.
 		 */
 		p = TAILQ_NEXT(p, listq);
 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 		    pa += PAGE_SIZE) {
 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_object_init_pt: invalid page %p", p));
 			if (pa != VM_PAGE_TO_PHYS(p) ||
 			    ma != p->md.mdpg_cache_attrs)
 				return;
 			p = TAILQ_NEXT(p, listq);
 		}
 
 		PMAP_LOCK(pmap);
 		/*
 		 * "pa" serves as both the loop cursor and the entry template:
 		 * it starts with the cache bits merged in and later picks up
 		 * PG_M | PG_A | PG_RW.  NOTE(review): the loop bound compares
 		 * this value with ptepa + size, which presumably relies on all
 		 * of those attribute bits lying below L3_PAGE_SIZE - confirm.
 		 */
 		for (pa = ptepa | pmap_cache_bits(ma);
 		    pa < ptepa + size; pa += L3_PAGE_SIZE) {
 			pdpg = pmap_allocl3e(pmap, addr, NULL);
 			if (pdpg == NULL) {
 				/*
 				 * The creation of mappings below is only an
 				 * optimization.  If a page directory page
 				 * cannot be allocated without blocking,
 				 * continue on to the next mapping rather than
 				 * blocking.
 				 */
 				addr += L3_PAGE_SIZE;
 				continue;
 			}
 			l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 			l3e = &l3e[pmap_pml3e_index(addr)];
 			if ((be64toh(*l3e) & PG_V) == 0) {
 				/* Pre-set modified/accessed so no fault is taken. */
 				pa |= PG_M | PG_A | PG_RW;
 				pte_store(l3e, pa);
 				pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 				counter_u64_add(pmap_l3e_mappings, 1);
 			} else {
 				/* Continue on if the PDE is already valid. */
 				pdpg->ref_count--;
 				KASSERT(pdpg->ref_count > 0,
 				    ("pmap_object_init_pt: missing reference "
 				    "to page directory page, va: 0x%lx", addr));
 			}
 			addr += L3_PAGE_SIZE;
 		}
 		ptesync();
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  * Report whether 'pmap' owns one of the first 16 PV entries examined for the
  * managed page 'm'.  The page's own PV list is scanned first, then (for
  * non-fictitious pages) the list returned by pa_to_pvh(); the limit of 16
  * applies to both scans combined.
  */
 boolean_t
 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv;
 	boolean_t found;
 	int visited;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
 
 	found = FALSE;
 	visited = 0;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++visited >= 16)
 			goto done;
 	}
 
 	/* Fictitious pages are not looked up through pa_to_pvh(). */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto done;
 
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			break;
 		}
 		if (++visited >= 16)
 			break;
 	}
 done:
 	rw_runlock(lock);
 	return (found);
 }
 
 /*
  * Set up the machine-dependent part of a vm_page: an empty PV list and the
  * default memory attribute.
  */
 void
 mmu_radix_page_init(vm_page_t m)
 {
 	struct md_page *md;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	md = &m->md;
 	TAILQ_INIT(&md->pv_list);
 	md->mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
 }
 
 /*
  * Count the mappings of page 'm' that have PG_W set, looking at both the
  * page's own PV list (entries resolved with pmap_pte()) and, for
  * non-fictitious pages, the PV list returned by pa_to_pvh() (entries
  * resolved with pmap_pml3e()).  Unmanaged pages always yield 0.
  *
  * The PV list lock is held for reading.  Each owning pmap must also be
  * locked to read its page table; when that cannot be done without blocking,
  * the PV list lock is dropped while waiting, and the whole scan restarts if
  * a generation count shows that a list changed in the meantime.
  */
 int
 mmu_radix_page_wired_mappings(vm_page_t m)
 {
 	struct rwlock *lock;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 	int count, md_gen, pvh_gen;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	/* A restart discards whatever was counted so far. */
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Remember the list generation, drop the PV list
 			 * lock, block on the pmap lock, and then verify that
 			 * the list was not modified while unlocked.
 			 */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		if ((be64toh(*pte) & PG_W) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/*
 				 * Same dance as above, but both generation
 				 * counts must be unchanged to continue.
 				 */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* Here the flag is read from the L3 entry itself. */
 			pte = pmap_pml3e(pmap, pv->pv_va);
 			if ((be64toh(*pte) & PG_W) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	return (count);
 }
 
 /*
  * Store the process table entry for 'pid': the root table physical address
  * 'l1pa' combined with RTS_SIZE and RADIX_PGD_INDEX_SHIFT, written in
  * big-endian byte order.
  */
 static void
 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
 {
 	uint64_t entry;
 
 	entry = RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT;
 	isa3_proctab[pid].proctab0 = htobe64(entry);
 }
 
 /*
  * Initialize a pmap: allocate and zero its root (L1) page directory, reset
  * its bookkeeping fields, allocate a PID from asid_arena and publish the
  * directory's physical address in the process table entry for that PID.
  * Always returns 1.
  */
 int
 mmu_radix_pinit(pmap_t pmap)
 {
 	vmem_addr_t pid;
 	vm_paddr_t l1pa;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 
 	/*
 	 * allocate the page directory page
 	 */
 	pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
 
 	/*
 	 * Zero the directory one page at a time.
 	 * NOTE(review): the loop bound is RADIX_PGD_SIZE_SHIFT, whose name
 	 * suggests a shift count rather than a number of pages -- confirm
 	 * that this covers exactly the size of the directory.
 	 */
 	for (int j = 0; j <  RADIX_PGD_SIZE_SHIFT; j++)
 		pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
 	pmap->pm_radix.rt_root = 0;
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = PMAP_PDE_SUPERPAGE;
 	/* The return value of vmem_alloc() is not checked here. */
 	vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
 
 	pmap->pm_pid = pid;
 	l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
 	mmu_radix_update_proctab(pid, l1pa);
 	/* Synchronize after the process table store. */
 	__asm __volatile("ptesync;isync" : : : "memory");
 
 	return (1);
 }
 
 /*
  * This routine is called if the desired page table page does not exist.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  *
  * 'ptepindex' selects the level at which the new page is linked:
  *   [0, NUPDE)                  the page is installed in an L3 entry,
  *   [NUPDE, NUPDE + NUPDPE)     the page is installed in an L2 entry,
  *   >= NUPDE + NUPDPE           the page is installed in an L1 entry.
  * Missing parent levels are allocated by recursion; existing parents gain
  * one reference.  On success the new page is returned and the pmap's
  * resident count is incremented by one.  The pmap lock must be held.
  *
  * The assertion messages print the entry after be64toh() so that the
  * reported value matches the value that was tested.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, pdppg, pdpg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (lockp != NULL) {
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			vm_wait(NULL);
 			PMAP_LOCK(pmap);
 		}
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	m->pindex = ptepindex;
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= (NUPDE + NUPDPE)) {
 		pml1_entry_t *l1e;
 		vm_pindex_t pml1index;
 
 		/* Wire up a new PDPE page */
 		pml1index = ptepindex - (NUPDE + NUPDPE);
 		l1e = &pmap->pm_pml1[pml1index];
 		KASSERT((be64toh(*l1e) & PG_V) == 0,
 		    ("%s: L1 entry %#lx is valid", __func__, be64toh(*l1e)));
 		pde_store(l1e, VM_PAGE_TO_PHYS(m));
 	} else if (ptepindex >= NUPDE) {
 		vm_pindex_t pml1index;
 		vm_pindex_t pdpindex;
 		pml1_entry_t *l1e;
 		pml2_entry_t *l2e;
 
 		/* Wire up a new l2e page */
 		pdpindex = ptepindex - NUPDE;
 		pml1index = pdpindex >> RPTE_SHIFT;
 
 		l1e = &pmap->pm_pml1[pml1index];
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
 			    lockp) == NULL) {
 				/* Give back the page allocated above. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/* Add reference to l2e page */
 			pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
 			pdppg->ref_count++;
 		}
 		l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 
 		/* Now find the pdp page */
 		l2e = &l2e[pdpindex & RPTE_MASK];
 		KASSERT((be64toh(*l2e) & PG_V) == 0,
 		    ("%s: L2 entry %#lx is valid", __func__, be64toh(*l2e)));
 		pde_store(l2e, VM_PAGE_TO_PHYS(m));
 	} else {
 		vm_pindex_t pml1index;
 		vm_pindex_t pdpindex;
 		pml1_entry_t *l1e;
 		pml2_entry_t *l2e;
 		pml3_entry_t *l3e;
 
 		/* Wire up a new PTE page */
 		pdpindex = ptepindex >> RPTE_SHIFT;
 		pml1index = pdpindex >> RPTE_SHIFT;
 
 		/* First, find the pdp and check that its valid. */
 		l1e = &pmap->pm_pml1[pml1index];
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 			/* The recursion filled in *l1e; reload through it. */
 			l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 			l2e = &l2e[pdpindex & RPTE_MASK];
 		} else {
 			l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 			l2e = &l2e[pdpindex & RPTE_MASK];
 			if ((be64toh(*l2e) & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 				    lockp) == NULL) {
 					vm_page_unwire_noq(m);
 					vm_page_free_zero(m);
 					return (NULL);
 				}
 			} else {
 				/* Add reference to the pd page */
 				pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
 				pdpg->ref_count++;
 			}
 		}
 		l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
 
 		/* Now we know where the page directory page is */
 		l3e = &l3e[ptepindex & RPTE_MASK];
 		KASSERT((be64toh(*l3e) & PG_V) == 0,
 		    ("%s: L3 entry %#lx is valid", __func__, be64toh(*l3e)));
 		pde_store(l3e, VM_PAGE_TO_PHYS(m));
 	}
 
 	pmap_resident_count_inc(pmap, 1);
 	return (m);
 }
 /*
  * Return the page that holds the L3 entries covering 'va', with one
  * reference added.  If the L2 entry is not valid, the page is allocated
  * through _pmap_allocpte(); the attempt is repeated after a failure only
  * when 'lockp' is non-NULL, otherwise NULL is returned.
  */
 static vm_page_t
 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t l2index, l3index;
 	pml2_entry_t *l2e;
 	vm_page_t pg;
 
 	for (;;) {
 		l2e = pmap_pml2e(pmap, va);
 		if (l2e != NULL && (be64toh(*l2e) & PG_V) != 0) {
 			/* Already present: just take a reference. */
 			pg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
 			pg->ref_count++;
 			return (pg);
 		}
 
 		/* Not present: allocate it. */
 		l3index = pmap_l3e_pindex(va);
 		l2index = l3index >> RPTE_SHIFT;
 		pg = _pmap_allocpte(pmap, NUPDE + l2index, lockp);
 		if (pg != NULL || lockp == NULL)
 			return (pg);
 	}
 }
 
 /*
  * Return the page table page that maps 'va', with one reference added.
  * A valid 2MB leaf mapping covering 'va' is demoted first.  If no page
  * table page is mapped, one is allocated through _pmap_allocpte(); the
  * attempt is repeated after a failure only when 'lockp' is non-NULL,
  * otherwise NULL is returned.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t pindex;
 	pml3_entry_t *l3e;
 	vm_page_t pg;
 
 	/* Index of the page table page; it does not change across retries. */
 	pindex = pmap_l3e_pindex(va);
 
 	for (;;) {
 		l3e = pmap_pml3e(pmap, va);
 
 		/*
 		 * Switching from a 2MB page to 4KB pages.  A failed demotion
 		 * has invalidated the 2MB mapping, which may have freed the
 		 * page holding the entry, so forget the pointer.
 		 */
 		if (l3e != NULL &&
 		    (be64toh(*l3e) & (RPTE_LEAF | PG_V)) ==
 		    (RPTE_LEAF | PG_V) &&
 		    !pmap_demote_l3e_locked(pmap, l3e, va, lockp))
 			l3e = NULL;
 
 		if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0) {
 			/* Page table page is mapped: take a reference. */
 			pg = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
 			pg->ref_count++;
 			return (pg);
 		}
 
 		/* Not mapped, or it has been deallocated. */
 		pg = _pmap_allocpte(pmap, pindex, lockp);
 		if (pg != NULL || lockp == NULL)
 			return (pg);
 	}
 }
 
 /*
  * Initialize a pmap that shares the kernel pmap's root table and PID.
  * Also sets PMAP_PDE_SUPERPAGE in the flags of both this pmap and the
  * kernel pmap.
  */
 static void
 mmu_radix_pinit0(pmap_t pmap)
 {
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 	PMAP_LOCK_INIT(pmap);
 
 	/* Borrow the translation root and PID from the kernel pmap. */
 	pmap->pm_pid = kernel_pmap->pm_pid;
 	pmap->pm_pml1 = kernel_pmap->pm_pml1;
 
 	/* Start with empty bookkeeping. */
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	pmap->pm_radix.rt_root = 0;
 
 	pmap->pm_flags = PMAP_PDE_SUPERPAGE;
 	kernel_pmap->pm_flags = pmap->pm_flags;
 }
 /*
  * pmap_protect_l3e: apply 'prot' to the 2MB mapping at *l3e.
  *
  * Write permission is removed when 'prot' lacks VM_PROT_WRITE, and PG_X is
  * set when 'prot' includes VM_PROT_EXECUTE.  Returns TRUE if the entry was
  * rewritten, FALSE if it already had the requested bits.  The caller holds
  * the pmap lock and is responsible for any invalidation.
  */
 static boolean_t
 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
 {
 	pt_entry_t cur, want;
 	vm_offset_t end, off;
 	vm_page_t pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L3_PAGE_MASK) == 0,
 	    ("pmap_protect_l3e: sva is not 2mpage aligned"));
 
 	for (;;) {
 		cur = be64toh(*l3e);
 		want = cur;
 
 		/* A managed, writable, modified superpage dirties each page. */
 		if ((cur & (PG_MANAGED | PG_M | PG_RW)) ==
 		    (PG_MANAGED | PG_M | PG_RW)) {
 			end = sva + L3_PAGE_SIZE;
 			pg = PHYS_TO_VM_PAGE(cur & PG_PS_FRAME);
 			for (off = sva; off < end; off += PAGE_SIZE, pg++)
 				vm_page_dirty(pg);
 		}
 
 		if ((prot & VM_PROT_WRITE) == 0) {
 			want &= ~(PG_RW | PG_M);
 			want |= RPTE_EAA_R;
 		}
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			want |= PG_X;
 
 		if (want == cur)
 			return (FALSE);
 
 		/*
 		 * PG_PROMOTED is cleared as an optimization for later
 		 * operations on this entry; the caller's invalidation removes
 		 * any lingering 4KB translations.  If the entry changed under
 		 * us, start over with the fresh value.
 		 */
 		if (atomic_cmpset_long(l3e, htobe64(cur),
 		    htobe64(want & ~PG_PROMOTED)))
 			return (TRUE);
 	}
 }
 
 /*
  * Apply protection 'prot' to the range [sva, eva) of 'pmap'.
  *
  * VM_PROT_NONE is handled by removing the range.  A request that includes
  * both write and execute permission returns without touching anything.
  * Otherwise each valid mapping loses PG_RW and PG_M when 'prot' lacks
  * VM_PROT_WRITE and gains PG_X when 'prot' includes VM_PROT_EXECUTE.
  * A 2MB mapping that is only partly covered by the range is demoted first.
  * If anything relevant changed, the whole pmap is invalidated at the end.
  */
 void
 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     vm_prot_t prot)
 {
 	vm_offset_t va_next;
 	pml1_entry_t *l1e;
 	pml2_entry_t *l2e;
 	pml3_entry_t ptpaddr, *l3e;
 	pt_entry_t *pte;
 	boolean_t anychanged;
 
 	CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
 	    prot);
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		mmu_radix_remove(pmap, sva, eva);
 		return;
 	}
 
 	/* Write and execute both requested: no change is made. */
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 #ifdef INVARIANTS
 	if (VERBOSE_PROTECT || pmap_logging)
 		printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
 			   pmap, sva, eva, prot, pmap->pm_pid);
 #endif
 	anychanged = FALSE;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * At each level, an invalid entry lets the walk skip to the
 		 * next boundary of that level; if that computation wraps
 		 * around, the walk is ended by jumping to eva.
 		 */
 		l1e = pmap_pml1e(pmap, sva);
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l2e = pmap_l1e_to_l2e(l1e, sva);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		l3e = pmap_l2e_to_l3e(l2e, sva);
 		ptpaddr = be64toh(*l3e);
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & RPTE_LEAF) != 0) {
 			/*
 			 * Are we protecting the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 				if (pmap_protect_l3e(pmap, l3e, sva, prot))
 					anychanged = TRUE;
 				continue;
 			} else if (!pmap_demote_l3e(pmap, l3e, sva)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 		}
 
 		/* Do not run past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 
 		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			pt_entry_t obits, pbits;
 			vm_page_t m;
 
 retry:
 			MPASS(pte == pmap_pte(pmap, sva));
 			obits = pbits = be64toh(*pte);
 			if ((pbits & PG_V) == 0)
 				continue;
 
 			if ((prot & VM_PROT_WRITE) == 0) {
 				/*
 				 * Record the modification on the page before
 				 * PG_M is cleared from the entry.
 				 */
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
 				pbits |= RPTE_EAA_R;
 			}
 			if (prot & VM_PROT_EXECUTE)
 				pbits |= PG_X;
 
 			if (pbits != obits) {
 				/* Retry if the entry changed since it was read. */
 				if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
 					goto retry;
 				/*
 				 * Only entries that had PG_A or PG_M set
 				 * trigger the invalidation below.
 				 */
 				if (obits & (PG_A|PG_M)) {
 					anychanged = TRUE;
 #ifdef INVARIANTS
 					if (VERBOSE_PROTECT || pmap_logging)
 						printf("%#lx %#lx -> %#lx\n",
 						    sva, obits, pbits);
 #endif
 				}
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Map the 'count' pages in the array 'ma' at consecutive kernel virtual
  * addresses starting at 'sva'.  Each entry gets read, write and privileged
  * access with PG_M and PG_A preset, plus the page's cache attribute bits.
  * If any entry that was overwritten had RPTE_VALID set, the whole range is
  * invalidated; otherwise only a ptesync is issued.
  */
 void
 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t newpte, replaced, *pte;
 	uint64_t attr_bits, cache_bits;
 	vm_offset_t eva, va;
 	vm_page_t m;
 
 	CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
 
 	attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
 	eva = sva + PAGE_SIZE * count;
 	replaced = 0;
 	pte = kvtopte(sva);
 	for (va = sva; va < eva; va += PAGE_SIZE, pte++) {
 		/* Look the PTE up again at every 2MB boundary. */
 		if (__predict_false((va & L3_PAGE_MASK) == 0))
 			pte = kvtopte(va);
 		MPASS(pte == pmap_pte(kernel_pmap, va));
 
 		m = *ma++;
 		cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
 		newpte = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
 		if (be64toh(*pte) != newpte) {
 			/* Accumulate the bits of every entry we overwrite. */
 			replaced |= be64toh(*pte);
 			pte_store(pte, newpte);
 		}
 	}
 	if (__predict_false((replaced & RPTE_VALID) != 0))
 		pmap_invalidate_range(kernel_pmap, sva, eva);
 	else
 		ptesync();
 }
 
 /*
  * Clear the kernel PTEs for 'count' consecutive pages starting at 'sva' and
  * invalidate the range that was walked.
  */
 void
 mmu_radix_qremove(vm_offset_t sva, int count)
 {
 	pt_entry_t *pte;
 	vm_offset_t va;
 
 	CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
 
 	pte = kvtopte(sva);
 	for (va = sva; va < sva + PAGE_SIZE * count;
 	    va += PAGE_SIZE, pte++) {
 		/* Look the PTE up again at every 2MB boundary. */
 		if (__predict_false((va & L3_PAGE_MASK) == 0))
 			pte = kvtopte(va);
 		pte_clear(pte);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue an unused page table page on 'free', the list of pages that will be
  * handed back to the physical memory manager once the TLB has been updated.
  * The page's PG_ZERO flag is set or cleared according to 'set_PG_ZERO'.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Add the page table page 'mpte' to the pmap's radix collection of idle
  * page table pages.  Every page table page covers its own range of virtual
  * addresses, and the collection is ordered by that range.  Returns the
  * result of vm_radix_insert().  The pmap lock must be held.
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	error = vm_radix_insert(&pmap->pm_radix, mpte);
 	return (error);
 }
 
 /*
  * Take the page table page that maps 'va' out of the pmap's radix
  * collection of idle page table pages and return it, or return NULL when
  * the collection holds no page for that address.  The pmap lock must be
  * held.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 	vm_pindex_t pindex;
 	vm_page_t mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pindex = pmap_l3e_pindex(va);
 	mpte = vm_radix_remove(&pmap->pm_radix, pindex);
 	return (mpte);
 }
 
 /*
  * Drop one reference on a page table page.  The reference count records
  * the number of valid entries within the page.  When it reaches zero the
  * page is unmapped through _pmap_unwire_ptp() and TRUE is returned;
  * otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	if (--m->ref_count != 0)
 		return (FALSE);
 	_pmap_unwire_ptp(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page 'm', which covers 'va': clear the entry that
  * points to it (the level is derived from m->pindex), decrement the pmap's
  * resident count, drop one reference on the parent page table page (which
  * may in turn be unmapped), and finally queue 'm' on 'free'.  The pmap lock
  * must be held.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/*
 	 * Unmap the page table page by zeroing the entry that refers to it.
 	 */
 	if (m->pindex >= NUPDE + NUPDPE) {
 		/* PDP page */
 		pml1_entry_t *pml1;
 		pml1 = pmap_pml1e(pmap, va);
 		*pml1 = 0;
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pml2_entry_t *l2e;
 		l2e = pmap_pml2e(pmap, va);
 		*l2e = 0;
 	} else {
 		/* PTE page */
 		pml3_entry_t *l3e;
 		l3e = pmap_pml3e(pmap, va);
 		*l3e = 0;
 	}
 	pmap_resident_count_dec(pmap, 1);
 	/*
 	 * Release the reference that this page held on its parent.  Pages
 	 * with pindex >= NUPDE + NUPDPE are linked from the root table and
 	 * have no parent page to release.
 	 */
 	if (m->pindex < NUPDE) {
 		/* We just released a PT, unhold the matching PD */
 		vm_page_t pdpg;
 
 		pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdpg, free);
 	}
 	else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 		/* We just released a PD, unhold the matching PDP */
 		vm_page_t pdppg;
 
 		pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdppg, free);
 	}
 
 	/*
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed: drop the reference
  * that the entry held on its page table page, which is located through
  * 'ptepde'.  Addresses at or above VM_MAXUSER_ADDRESS are left alone and
  * yield 0; otherwise the result of pmap_unwire_ptp() is returned.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
     struct spglist *free)
 {
 	vm_page_t ptpg;
 
 	if (va < VM_MAXUSER_ADDRESS) {
 		KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 		ptpg = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 		return (pmap_unwire_ptp(pmap, va, ptpg, free));
 	}
 	return (0);
 }
 
 /*
  * Release the resources held by a pmap that no longer has any mappings:
  * invalidate its translations, clear its process table entry, free the
  * root page directory and return its PID to asid_arena.  The pmap must
  * have a resident count of zero and an empty radix collection of page
  * table pages.
  */
 void
 mmu_radix_release(pmap_t pmap)
 {
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_radix),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
 	/* Invalidate first, then retire the process table entry. */
 	pmap_invalidate_all(pmap);
 	isa3_proctab[pmap->pm_pid].proctab0 = 0;
 	uma_zfree(zone_radix_pgd, pmap->pm_pml1);
 	vmem_free(asid_arena, pmap->pm_pid, 1);
 }
 
 /*
  * Create the PV entry for a 2MB page mapping of 'va' described by 'pde'.
  * Returns true on success.  Returns false only when no PV entry could be
  * obtained, which the code allows for when PMAP_ENTER_NORECLAIM is set in
  * 'flags' (reclamation is then disabled).  The pmap lock must be held.
  */
 static bool
 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *pvh;
 	vm_paddr_t pa;
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	reclaim_lockp = (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp;
 	pv = get_pv_entry(pmap, reclaim_lockp);
 	if (pv == NULL)
 		return (false);
 
 	pv->pv_va = va;
 	pa = pde & PG_PS_FRAME;
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 	pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Fill all NPTEPG entries of a page table page with mappings to consecutive
  * physical pages: entry i receives 'newpte' advanced by i pages, stored in
  * big-endian byte order.
  */
 static void
 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 {
 	u_int i;
 
 	for (i = 0; i < NPTEPG; i++) {
 		firstpte[i] = htobe64(newpte);
 		newpte += PAGE_SIZE;
 	}
 }
 
 /*
  * Demote the 2MB mapping at 'pde' covering 'va' without a caller-supplied
  * PV list lock: a local lock pointer is passed to pmap_demote_l3e_locked()
  * and whatever lock it acquired is released before returning its result.
  */
 static boolean_t
 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
 {
 	struct rwlock *lock = NULL;
 	boolean_t demoted;
 
 	demoted = pmap_demote_l3e_locked(pmap, pde, va, &lock);
 	if (lock != NULL)
 		rw_wunlock(lock);
 	return (demoted);
 }
 
 /*
  * Replace the valid 2MB leaf mapping at *l3e, which covers 'va', with a
  * page table page of 4KB mappings to the same physical memory.
  *
  * Returns TRUE on success.  Returns FALSE, after removing and invalidating
  * the 2MB mapping, if the mapping was never accessed or no page table page
  * could be obtained.  'lockp' is handed to the PV helpers, which may change
  * the PV list lock that it refers to.  The pmap lock must be held.
  */
 static boolean_t
 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pml3_entry_t oldpde;
 	pt_entry_t *firstpte;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	struct spglist free;
 	vm_offset_t sva;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpde = be64toh(*l3e);
 	KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
 	    ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
 	    oldpde));
 	/*
 	 * Prefer the page table page saved in the pmap's radix collection;
 	 * it is only looked up when the mapping has been accessed.
 	 */
 	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 	    NULL) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_l3e: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * Invalidate the 2MB page mapping and return "failure" if the
 		 * mapping was never accessed or the allocation of the new
 		 * page table page fails.  If the 2MB page mapping belongs to
 		 * the direct map region of the kernel's address space, then
 		 * the page allocation request specifies the highest possible
 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 		 * normal.  Page table pages are preallocated for every other
 		 * part of the kernel address space, so the direct map region
 		 * is the only part of the kernel address space that must be
 		 * handled here.
 		 */
 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj(
 		    (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ?
 		    VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) {
 			SLIST_INIT(&free);
 			sva = trunc_2mpage(va);
 			pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
 			pmap_invalidate_l3e_page(pmap, sva, oldpde);
 			vm_page_free_pages_toq(&free, true);
 			CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (FALSE);
 		}
 		/* A freshly allocated page: record which range it maps. */
 		mpte->pindex = pmap_l3e_pindex(va);
 		if (va < VM_MAXUSER_ADDRESS)
 			pmap_resident_count_inc(pmap, 1);
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	KASSERT((oldpde & PG_A) != 0,
 	    ("pmap_demote_l3e: oldpde is missing PG_A"));
 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_l3e: oldpde is missing PG_M"));
 
 	/*
 	 * If the page table page is new, initialize it.
 	 */
 	if (mpte->ref_count == 1) {
 		mpte->ref_count = NPTEPG;
 		pmap_fill_ptp(firstpte, oldpde);
 	}
 
 	KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
 	    ("pmap_demote_l3e: firstpte and newpte map different physical"
 	    " addresses"));
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
 		pmap_fill_ptp(firstpte, oldpde);
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
 	 * of the PDE and the PV lists will be inconsistent, which can result
 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
 	 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
 	 * PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 
 	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
 	 * processor changing the setting of PG_A and/or PG_M between
 	 * the read above and the store below.
 	 */
 	pde_store(l3e, mptepa);
 	pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde);
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
 
 	counter_u64_add(pmap_l3e_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
  *
  * The L3 entry is not left empty: it is pointed at the page table page
  * that was saved for 'va' in the pmap's radix collection, after that page
  * has been zeroed.  Panics if no such page exists.  Only valid for
  * kernel_pmap, with the pmap lock held.
  */
 static void
 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
 {
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mpte = pmap_remove_pt_page(pmap, va);
 	if (mpte == NULL)
 		panic("pmap_remove_kernel_pde: Missing pt page.");
 
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 
 	/*
 	 * Initialize the page table page.
 	 */
 	pagezero(PHYS_TO_DMAP(mptepa));
 
 	/*
 	 * Demote the mapping.
 	 */
 	pde_store(l3e, mptepa);
 	ptesync();
 }
 
 /*
  * pmap_remove_l3e: do the things to unmap a superpage in a process
  *
  * Clears the 2MB entry at 'pdq' for the aligned address 'sva', updates the
  * wired and resident counts, and for a managed mapping frees its PV entry
  * and transfers the modified/referenced state to each 4KB page.  For the
  * kernel pmap the entry is then repointed at a zeroed page table page; for
  * other pmaps the saved page table page, if any, is queued on 'free'.
  * Returns the result of pmap_unuse_pt() on the parent entry.  TLB
  * invalidation is left to the caller.  The pmap lock must be held.
  */
 static int
 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pml3_entry_t oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m, mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L3_PAGE_MASK) == 0,
 	    ("pmap_remove_l3e: sva is not 2mpage aligned"));
 	/* Fetch and clear the entry in one step. */
 	oldpde = be64toh(pte_load_clear(pdq));
 	if (oldpde & PG_W)
 		pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
 	pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 	if (oldpde & PG_MANAGED) {
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + L3_PAGE_SIZE;
 		/* Propagate the superpage's state to every 4KB page. */
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/* No PV entries remain on either list. */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_l3e(pmap, pdq, sva);
 	} else {
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			/*
 			 * The saved page table page is no longer needed:
 			 * uncount it and queue it for freeing.
 			 */
 			pmap_resident_count_dec(pmap, 1);
 			KASSERT(mpte->ref_count == NPTEPG,
 			    ("pmap_remove_l3e: pte page wire count error"));
 			mpte->ref_count = 0;
 			pmap_add_delayed_free_list(mpte, free, FALSE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  *
  * Clears the PTE at 'ptq' for 'va', updates the wired and resident counts,
  * and for a managed mapping transfers the modified/referenced state to the
  * page and frees its PV entry.  'ptepde' is the L3 entry that refers to
  * the page table page containing 'ptq'.  Returns the result of
  * pmap_unuse_pt().  TLB invalidation is left to the caller.  The pmap lock
  * must be held.
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t oldpte;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* Fetch and clear the entry in one step. */
 	oldpte = be64toh(pte_load_clear(ptq));
 	if (oldpte & RPTE_WIRED)
 		pmap->pm_stats.wired_count -= 1;
 	pmap_resident_count_dec(pmap, 1);
 	if (oldpte & RPTE_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		pmap_pvh_free(&m->md, pmap, va);
 		/*
 		 * Clear PGA_WRITEABLE once neither the page's own PV list
 		 * nor the list from pa_to_pvh() has any entry left.
 		 */
 		if (TAILQ_EMPTY(&m->md.pv_list) &&
 		    (m->flags & PG_FICTITIOUS) == 0) {
 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 			if (TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
 
 /*
  * Remove the single 4KB mapping of 'va' reached through 'l3e'.
  *
  * Returns false when there is nothing valid to remove, or when the mapping
  * was removed and its page was invalidated here.  Returns true when
  * pmap_remove_pte() reported a nonzero result, in which case no
  * invalidation is done here and the caller is expected to handle it.
  */
 static bool
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
     struct spglist *free)
 {
 	struct rwlock *lock = NULL;
 	pt_entry_t *pte;
 	bool need_full_inval;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((be64toh(*l3e) & RPTE_VALID) == 0)
 		return (false);
 	pte = pmap_l3e_to_pte(l3e, va);
 	if ((be64toh(*pte) & RPTE_VALID) == 0)
 		return (false);
 
 	need_full_inval = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free,
 	    &lock);
 	if (lock != NULL)
 		rw_wunlock(lock);
 	if (need_full_inval)
 		return (true);
 
 	pmap_invalidate_page(pmap, va);
 	return (false);
 }
 
 /*
  * Removes the specified range of addresses from the page table page.
  *
  * Walks the PTEs for [sva, eva) under 'l3e' and removes every non-zero
  * entry.  Invalidation is done here: either the whole pmap (return value
  * true) or just the single contiguous run of removed pages (return value
  * false).  The pmap lock must be held.
  */
 static bool
 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t *pte;
 	vm_offset_t va;
 	bool anyvalid;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	anyvalid = false;
 	/*
 	 * 'va' is the start of the current run of removed pages that still
 	 * needs invalidating; the value 'eva' means there is no such run.
 	 */
 	va = eva;
 	for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
 	    sva += PAGE_SIZE) {
 		MPASS(pte == pmap_pte(pmap, sva));
 		if (*pte == 0) {
 			/*
 			 * An empty entry ends the pending run; from here on
 			 * a full invalidation is requested instead.
 			 */
 			if (va != eva) {
 				anyvalid = true;
 				va = eva;
 			}
 			continue;
 		}
 		if (va == eva)
 			va = sva;
 		if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
 			/*
 			 * A nonzero result from pmap_remove_pte() stops the
 			 * walk and requests a full invalidation.
 			 */
 			anyvalid = true;
 			sva += PAGE_SIZE;
 			break;
 		}
 	}
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	else if (va != eva)
 		pmap_invalidate_range(pmap, va, sva);
 	return (anyvalid);
 }
 
 /*
  * Remove all mappings in the range [sva, eva) from 'pmap'.
  *
  * A single-page request that is not covered by a 2MB mapping takes a short
  * path.  Otherwise the page table is walked level by level: invalid
  * entries are skipped, 2MB mappings that lie entirely inside the range are
  * removed whole, partly covered ones are demoted first, and 4KB mappings
  * are removed through pmap_remove_ptes().  Page table pages released along
  * the way are collected on a local list and freed after the pmap lock has
  * been dropped.
  */
 void
 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct rwlock *lock;
 	vm_offset_t va_next;
 	pml1_entry_t *l1e;
 	pml2_entry_t *l2e;
 	pml3_entry_t ptpaddr, *l3e;
 	struct spglist free;
 	bool anyvalid;
 
 	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	anyvalid = false;
 	SLIST_INIT(&free);
 
 	/* XXX something fishy here */
 	/* Both bounds are rounded up to a page boundary. */
 	sva = (sva + PAGE_MASK) & ~PAGE_MASK;
 	eva = (eva + PAGE_MASK) & ~PAGE_MASK;
 
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if (sva + PAGE_SIZE == eva) {
 		l3e = pmap_pml3e(pmap, sva);
 		if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
 			anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
 			/* 'lock' is not used on this path. */
 			goto out;
 		}
 	}
 
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 		/* Nothing left to remove. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 		/*
 		 * At each level, an invalid entry lets the walk skip to the
 		 * next boundary of that level; if that computation wraps
 		 * around, the walk is ended by jumping to eva.
 		 */
 		l1e = pmap_pml1e(pmap, sva);
 		if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
 			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l2e = pmap_l1e_to_l2e(l1e, sva);
 		if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
 			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		l3e = pmap_l2e_to_l3e(l2e, sva);
 		ptpaddr = be64toh(*l3e);
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & RPTE_LEAF) != 0) {
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 				pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
 				anyvalid = true;
 				continue;
 			} else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
 			    &lock)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
 				ptpaddr = be64toh(*l3e);
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
 			anyvalid = true;
 	}
 	/* Release whichever PV list lock the helpers left held. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	/* Free the collected page table pages after unlocking. */
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * Remove every mapping of the managed page "m" from all pmaps.
  *
  * The first loop runs pmap_demote_l3e_locked() on each entry of the
  * superpage pv list ("pvh"); the second loop then tears down each 4KB
  * mapping on the page's own pv list, transferring the wired, referenced and
  * modified state of the old PTE to the pmap statistics and the vm_page.
  *
  * Both loops first try the pmap lock without blocking.  When that fails the
  * pv list lock is dropped, the pmap lock is taken, and the pv list lock is
  * reacquired; if a pv generation counter changed in the meantime the whole
  * operation restarts from "retry".
  */
 void
 mmu_radix_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pt_entry_t *pte, tpte;
 	pml3_entry_t *l3e;
 	vm_offset_t va;
 	struct spglist free;
 	int pvh_gen, md_gen;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the dummy superpage pv head. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry:
 	rw_wlock(lock);
 	/* Pass 1: demote the mappings found on the superpage pv list. */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			/* The list changed while unlocked: start over. */
 			if (pvh_gen != pvh->pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		l3e = pmap_pml3e(pmap, va);
 		(void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: destroy every 4KB mapping of the page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pmap_resident_count_dec(pmap, 1);
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 		/* Clear the PTE and keep its previous contents in host order. */
 		tpte = be64toh(pte_load_clear(pte));
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(lock);
 	/* Release the collected page table pages after dropping the lock. */
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  *
  * Although this function destroys all of the pmap's managed,
  * non-wired mappings, it can delay and batch the invalidation of TLB
  * entries without calling pmap_delayed_invl_started() and
  * pmap_delayed_invl_finished().  Because the pmap is not active on
  * any other processor, none of these TLB entries will ever be used
  * before their eventual invalidation.  Consequently, there is no need
  * for either pmap_remove_all() or pmap_remove_write() to wait for
  * that eventual TLB invalidation.
  *
  * Implementation outline: every pv chunk of the pmap is scanned; for each
  * in-use pv entry the corresponding leaf entry (L3 superpage or 4KB PTE)
  * is located, wired mappings are skipped, and all others are cleared and
  * their pv entries marked free in the chunk bitmap.  A chunk is released
  * once none of its entries had to be skipped.  A single
  * pmap_invalidate_all() is issued at the end.
  */
 
 void
 mmu_radix_remove_pages(pmap_t pmap)
 {
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 	pml3_entry_t ptel3e;
 	pt_entry_t *pte, tpte;
 	struct spglist free;
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx;
 	boolean_t superpage;
 	vm_paddr_t pa;
 
 	/*
 	 * Assert that the given pmap is only active on the current
 	 * CPU.  Unfortunately, we cannot block another CPU from
 	 * activating the pmap while this function is executing.
 	 */
 	KASSERT(pmap->pm_pid == mfspr(SPR_PID),
 	    ("non-current asid %lu - expected %lu", pmap->pm_pid,
 	    mfspr(SPR_PID)));
 
 	lock = NULL;
 
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		/* Cleared as soon as one entry of this chunk must be kept. */
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* A zero bit in pc_map marks an allocated pv entry. */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = cnttzd(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				/*
 				 * Walk L2 -> L3; "pte" ends up pointing at
 				 * the leaf entry and "ptel3e" at the entry
 				 * referencing the page that contains it.
 				 */
 				pte = pmap_pml2e(pmap, pv->pv_va);
 				ptel3e = be64toh(*pte);
 				pte = pmap_l2e_to_l3e(pte, pv->pv_va);
 				tpte = be64toh(*pte);
 				if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
 					superpage = FALSE;
 					ptel3e = tpte;
 					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 					    PG_FRAME);
 					pte = &pte[pmap_pte_index(pv->pv_va)];
 					tpte = be64toh(*pte);
 				} else {
 					/*
 					 * Keep track whether 'tpte' is a
 					 * superpage explicitly instead of
 					 * relying on RPTE_LEAF being set.
 					 *
 					 * This is because RPTE_LEAF is numerically
 					 * identical to PG_PTE_PAT and thus a
 					 * regular page could be mistaken for
 					 * a superpage.
 					 */
 					superpage = TRUE;
 				}
 
 				if ((tpte & PG_V) == 0) {
 					panic("bad pte va %lx pte %lx",
 					    pv->pv_va, tpte);
 				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				if (superpage)
 					pa = tpte & PG_PS_FRAME;
 				else
 					pa = tpte & PG_FRAME;
 
 				m = PHYS_TO_VM_PAGE(pa);
 				KASSERT(m->phys_addr == pa,
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad tpte %#jx",
 				    (uintmax_t)tpte));
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 					if (superpage) {
 						for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 					} else
 						vm_page_dirty(m);
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 				if (superpage) {
 					pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 					pvh->pv_gen++;
 					/*
 					 * No superpage mapping remains: drop
 					 * PGA_WRITEABLE from each constituent
 					 * page that has no 4KB mapping either.
 					 */
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
 							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * Free the page table page that
 					 * pmap_remove_pt_page() returns for
 					 * this address, if there is one.
 					 */
 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 					if (mpte != NULL) {
 						pmap_resident_count_dec(pmap, 1);
 						KASSERT(mpte->ref_count == NPTEPG,
 						    ("pmap_remove_pages: pte page wire count error"));
 						mpte->ref_count = 0;
 						pmap_add_delayed_free_list(mpte, &free, FALSE);
 					}
 				} else {
 					pmap_resident_count_dec(pmap, 1);
 #ifdef VERBOSE_PV
 					printf("freeing pv (%p, %p)\n",
 						   pmap, pv);
 #endif
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 					m->md.pv_gen++;
 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
 					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m, PGA_WRITEABLE);
 					}
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Only chunks without surviving (wired) entries are released. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * Revoke write access from every mapping of the managed, busied page "m".
  *
  * Returns immediately when pmap_page_is_write_mapped() reports no writable
  * mapping.  Otherwise writable superpage mappings are first demoted, then
  * each 4KB PTE that has PG_RW set is atomically rewritten with PG_RW and
  * PG_M cleared and RPTE_EAA_R set.  A page whose old PTE had PG_M set is
  * marked dirty.  PGA_WRITEABLE is cleared at the end.
  *
  * Lock handling follows the same try-lock / generation-count retry scheme
  * used by the other pv list walkers in this file.
  */
 void
 mmu_radix_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pv_entry_t next_pv, pv;
 	pml3_entry_t *l3e;
 	pt_entry_t oldpte, *pte;
 	int pvh_gen, md_gen;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 	vm_page_assert_busied(m);
 
 	if (!pmap_page_is_write_mapped(m))
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry_pv_loop:
 	rw_wlock(lock);
 	/* Pass 1: demote writable superpage mappings of the page. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			/* The list changed while unlocked: start over. */
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		if ((be64toh(*l3e) & PG_RW) != 0)
 			(void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: write-protect every 4KB mapping of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen ||
 			    md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		l3e = pmap_pml3e(pmap, pv->pv_va);
 		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
 		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 retry:
 		oldpte = be64toh(*pte);
 		if (oldpte & PG_RW) {
 			/* Retry until the compare-and-set succeeds. */
 			if (!atomic_cmpset_long(pte, htobe64(oldpte),
 			    htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
 				goto retry;
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware
  *	feature, so there is no need to invalidate any TLB entries.
  *	Since pmap_demote_l3e() for the wired entry must never fail,
  *	pmap_delayed_invl_started()/finished() calls around the
  *	function are not needed.
  *
  *	The function panics if a valid mapping in the range lacks PG_W or
  *	if a required large page demotion fails.
  */
 void
 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pml1_entry_t *l1e;
 	pml2_entry_t *l2e;
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 
 	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1e = pmap_pml1e(pmap, sva);
 		if ((be64toh(*l1e) & PG_V) == 0) {
 			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 			/* Address wrapped around: terminate the walk. */
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		l2e = pmap_l1e_to_l2e(l1e, sva);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 		if (va_next < sva)
 			va_next = eva;
 		l3e = pmap_l2e_to_l3e(l2e, sva);
 		if ((be64toh(*l3e) & PG_V) == 0)
 			continue;
 		if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
 			if ((be64toh(*l3e) & PG_W) == 0)
 				panic("pmap_unwire: pde %#jx is missing PG_W",
 				    (uintmax_t)(be64toh(*l3e)));
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 				atomic_clear_long(l3e, htobe64(PG_W));
 				pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
 				    PAGE_SIZE;
 				continue;
 			} else if (!pmap_demote_l3e(pmap, l3e, sva))
 				panic("pmap_unwire: demotion failed");
 		}
 		/* Clamp the 4KB scan to the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 		/* Note: this inner loop advances the outer cursor "sva". */
 		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			MPASS(pte == pmap_pte(pmap, sva));
 			if ((be64toh(*pte) & PG_V) == 0)
 				continue;
 			if ((be64toh(*pte) & PG_W) == 0)
 				panic("pmap_unwire: pte %#jx is missing PG_W",
 				    (uintmax_t)(be64toh(*pte)));
 
 			/*
 			 * PG_W must be cleared atomically.  Although the pmap
 			 * lock synchronizes access to PG_W, another processor
 			 * could be setting PG_M and/or PG_A concurrently.
 			 */
 			atomic_clear_long(pte, htobe64(PG_W));
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Zero the page "m" through its direct-map address.
  */
 void
 mmu_radix_zero_page(vm_page_t m)
 {
 	vm_paddr_t phys;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	phys = VM_PAGE_TO_PHYS(m);
 	pagezero(PHYS_TO_DMAP(phys));
 }
 
 /*
  * Zero "size" bytes starting "off" bytes into the page "m", using the
  * page's direct-map address.  The area must not extend past the page.
  */
 void
 mmu_radix_zero_page_area(vm_page_t m, int off, int size)
 {
 	caddr_t page_base;
 
 	CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
 	MPASS(off + size <= PAGE_SIZE);
 	page_base = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	memset(&page_base[off], 0, size);
 }
 
 /*
  * Report the mincore(2) state of the mapping for "addr" in "pmap".
  *
  * The return value is a combination of MINCORE_* flags; MINCORE_PSIND(1)
  * is included when the address is covered by a leaf L3 entry.  When the
  * mapping is valid and managed, and the modified and referenced "other"
  * flags are not both already set, the physical address of the 4KB page is
  * stored in *locked_pa.
  */
 static int
 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pml3_entry_t *l3ep;
 	pt_entry_t pte;
 	vm_paddr_t pa;
 	int val;
 
 	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
 	PMAP_LOCK(pmap);
 
 	/* These defaults describe an address without a valid L3 entry. */
 	pte = 0;
 	pa = 0;
 	val = 0;
 
 	l3ep = pmap_pml3e(pmap, addr);
 	if (l3ep != NULL && (be64toh(*l3ep) & PG_V) != 0) {
 		if ((be64toh(*l3ep) & RPTE_LEAF) != 0) {
 			pte = be64toh(*l3ep);
 			/* Compute the physical address of the 4KB page. */
 			pa = ((be64toh(*l3ep) & PG_PS_FRAME) |
 			    (addr & L3_PAGE_MASK)) & PG_FRAME;
 			val = MINCORE_PSIND(1);
 		} else {
 			/* Native endian PTE, do not pass to functions */
 			pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
 			pa = pte & PG_FRAME;
 		}
 	}
 
 	if ((pte & PG_V) != 0) {
 		val |= MINCORE_INCORE;
 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((pte & PG_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 
 	if ((pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V) &&
 	    (val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER))
 		*locked_pa = pa;
 
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Switch to the pmap of the thread's process.  Inside a critical section,
  * mmu_radix_pid_set() is called when the pmap's PID is above isa3_base_pid
  * and differs from the value currently held in SPR_PID.
  */
 void
 mmu_radix_activate(struct thread *td)
 {
 	pmap_t target;
 	uint32_t hw_pid;
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, td);
 	critical_enter();
 	target = vmspace_pmap(td->td_proc->p_vmspace);
 	hw_pid = mfspr(SPR_PID);
 	if (target->pm_pid > isa3_base_pid && hw_pid != target->pm_pid)
 		mmu_radix_pid_set(target);
 	critical_exit();
 }
 
 /*
  *	Increase the starting virtual address of the given mapping if a
  *	different alignment might result in more superpage mappings.
  *
  *	*addr is only ever moved upward, so that its offset within an L3
  *	page matches that of the (color-adjusted) object offset.  Mappings
  *	too small to contain a full L3 page after alignment are left alone.
  */
 void
 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t superpage_offset;
 	vm_offset_t cur_offset;
 	vm_size_t lead;
 
 	CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
 	    size);
 
 	if (size < L3_PAGE_SIZE)
 		return;
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	superpage_offset = offset & L3_PAGE_MASK;
 
 	/* Bytes preceding the first L3 boundary inside the mapping. */
 	lead = (L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK;
 	if (size - lead < L3_PAGE_SIZE)
 		return;
 
 	cur_offset = *addr & L3_PAGE_MASK;
 	if (cur_offset == superpage_offset)
 		return;
 
 	/* Stay in the current L3 page if possible, else use the next one. */
 	*addr = ((cur_offset < superpage_offset) ?
 	    (*addr & ~L3_PAGE_MASK) :
 	    ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK)) + superpage_offset;
 }
 
 /*
  * Map the physical range [pa, pa + size) into newly allocated kernel
  * virtual address space with memory attribute "attr" and return the
  * virtual address that corresponds to "pa" (including its sub-page offset).
  *
  * Panics if "pa" lies below powerpc_ptob(Maxmem) or if no kernel virtual
  * address space could be allocated.
  */
 static void *
 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
 {
 	vm_offset_t va, tmpva, ppa, offset;
 
 	/* Expand the request to whole pages. */
 	offset = pa & PAGE_MASK;
 	ppa = trunc_page(pa);
 	size = roundup2(offset + size, PAGE_SIZE);
 
 	if (pa < powerpc_ptob(Maxmem))
 		panic("bad pa: %#lx less than Maxmem %#lx\n",
 			  pa, powerpc_ptob(Maxmem));
 
 	va = kva_alloc(size);
 	if (bootverbose)
 		printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
 	KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
 
 	if (va == 0)
 		panic("%s: Couldn't alloc kernel virtual memory", __func__);
 
 	/* Enter one kernel mapping per page of the range. */
 	tmpva = va;
 	while (size > 0) {
 		mmu_radix_kenter_attr(tmpva, ppa, attr);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		ppa += PAGE_SIZE;
 	}
 	ptesync();
 
 	return ((void *)(va + offset));
 }
 
 /*
  * Map a physical range with the default memory attribute; see
  * mmu_radix_mapdev_attr().
  */
 static void *
 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 	void *mapping;
 
 	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
 
 	mapping = mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT);
 	return (mapping);
 }
 
 /*
  * Record the memory attribute "ma" for page "m" and, for non-fictitious
  * pages, apply it to the page's direct mapping.  Panics if the direct map
  * update fails.
  */
 void
 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 
 	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
 	m->md.mdpg_cache_attrs = ma;
 
 	/* Fictitious pages get no direct map update. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * If "m" is a normal page, update its direct mapping.  This update
 	 * can be relied upon to perform any cache operations that are
 	 * required for data coherence.
 	 */
 	if (mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
 	    PAGE_SIZE, m->md.mdpg_cache_attrs) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Undo a device mapping: remove the kernel mappings covering
  * [va, va + size) and release the virtual address range.  Addresses inside
  * the direct map are ignored, as is any request made before the pmap
  * module is initialized.
  */
 static void
 mmu_radix_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	vm_offset_t pgoff;
 
 	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size);
 	/* If we gave a direct map region in pmap_mapdev, do nothing */
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return;
 
 	/* Expand the request to whole pages. */
 	pgoff = va & PAGE_MASK;
 	size = round_page(pgoff + size);
 	va = trunc_page(va);
 
 	if (!pmap_initialized)
 		return;
 
 	mmu_radix_qremove(va, atop(size));
 	kva_free(va, size);
 }
 
 /*
  * Atomically replace the bits selected by "mask" in the (big-endian)
  * page table entry "*pte" with "cache_bits".  The entry is left untouched
  * when it already holds the requested value.
  */
 static __inline void
 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
 {
 	uint64_t cur, want;
 
 	/*
 	 * Spin on a 64-bit compare-and-set so that concurrent updates to
 	 * other bits of the entry are not lost.
 	 */
 	for (;;) {
 		cur = be64toh(*pte);
 		want = (cur & ~mask) | cache_bits;
 		if (want == cur)
 			break;
 		if (atomic_cmpset_long(pte, htobe64(cur), htobe64(want)))
 			break;
 	}
 }
 
 /*
  * Tries to demote a 1GB page mapping.
  *
  * A new page directory page is allocated and filled with NPDEPG leaf
  * entries derived from the old L2 entry, each advanced by L3_PAGE_SIZE;
  * the L2 entry is then redirected to that page.  Returns FALSE when the
  * page allocation fails (the mapping is left untouched), TRUE on success.
  * The pmap lock must be held.
  */
 static boolean_t
 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
 {
 	pml2_entry_t old_l2e;
 	pml3_entry_t *l3_table, entry;
 	vm_paddr_t table_pa;
 	vm_page_t table_pg;
 	int i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	old_l2e = be64toh(*l2e);
 	KASSERT((old_l2e & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 
 	table_pg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
 	if (table_pg == NULL) {
 		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
 	}
 	table_pg->pindex = va >> L2_PAGE_SIZE_SHIFT;
 	table_pa = VM_PAGE_TO_PHYS(table_pg);
 	l3_table = (pml3_entry_t *)PHYS_TO_DMAP(table_pa);
 
 	KASSERT((old_l2e & PG_A) != 0,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 	KASSERT((old_l2e & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 
 	/*
 	 * Initialize the page directory page: every slot inherits the old
 	 * entry's bits and maps the next L3_PAGE_SIZE-sized piece.
 	 */
 	entry = old_l2e;
 	for (i = 0; i < NPDEPG; i++) {
 		l3_table[i] = htobe64(entry);
 		entry += L3_PAGE_SIZE;
 	}
 
 	/*
 	 * Demote the mapping.
 	 */
 	pde_store(l2e, table_pa);
 
 	/*
 	 * Flush PWC --- XXX revisit
 	 */
 	pmap_invalidate_all(pmap);
 
 	counter_u64_add(pmap_l2e_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Translate the kernel virtual address "va" to a physical address.
  * Direct-map addresses are converted arithmetically; all others are looked
  * up in the kernel pmap, handling both leaf L3 entries and 4KB PTEs.
  */
 vm_paddr_t
 mmu_radix_kextract(vm_offset_t va)
 {
 	pml3_entry_t l3e;
 	vm_paddr_t pa;
 
 	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return (DMAP_TO_PHYS(va));
 
 	/* Big-endian PTE on stack */
 	l3e = *pmap_pml3e(kernel_pmap, va);
 	if ((be64toh(l3e) & RPTE_LEAF) != 0)
 		return ((be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK));
 
 	/*
 	 * Beware of a concurrent promotion that changes the
 	 * PDE at this point!  For example, vtopte() must not
 	 * be used to access the PTE because it would use the
 	 * new PDE.  It is, however, safe to use the old PDE
 	 * because the page table page is preserved by the
 	 * promotion.
 	 */
 	pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
 	return ((pa & PG_FRAME) | (va & PAGE_MASK));
 }
 
 /*
  * Select the PTE attribute bits for a mapping of "pa".  An explicit
  * (non-default) memory attribute is translated with pmap_cache_bits();
  * otherwise addresses inside a known physical memory region are treated
  * as ordinary memory and everything else as guarded I/O.
  */
 static pt_entry_t
 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
 {
 	int i;
 
 	if (ma != VM_MEMATTR_DEFAULT)
 		return (pmap_cache_bits(ma));
 
 	/*
 	 * Assume the page is cache inhibited and access is guarded unless
 	 * it's in our available memory array.
 	 */
 	for (i = 0; i < pregions_sz; i++) {
 		if (pa < pregions[i].mr_start)
 			continue;
 		if (pa < (pregions[i].mr_start + pregions[i].mr_size))
 			return (RPTE_ATTR_MEM);
 	}
 	return (RPTE_ATTR_GUARDEDIO);
 }
 
 /*
  * Install a read/write, privileged kernel mapping of "pa" at "va", with
  * the referenced and modified bits set and with cache attributes chosen
  * by mmu_radix_calc_wimg().
  */
 static void
 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
 {
 	pt_entry_t *slot, entry;
 	uint64_t attr_bits;
 
 	slot = kvtopte(va);
 	MPASS(slot != NULL);
 	entry = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
 	attr_bits = mmu_radix_calc_wimg(pa, ma);
 	entry |= attr_bits;
 	pte_store(slot, entry);
 }
 
 /*
  * Clear the kernel page table entry that maps "va".
  */
 void
 mmu_radix_kremove(vm_offset_t va)
 {
 
 	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
 
 	pte_clear(kvtopte(va));
 }
 
 /*
  * Classify "addr": the address is returned unchanged in *decoded and
  * *is_user is set to 1 when it lies below VM_MAXUSER_ADDRESS, 0 otherwise.
  * Always returns 0.
  */
 int
 mmu_radix_decode_kernel_ptr(vm_offset_t addr,
     int *is_user, vm_offset_t *decoded)
 {
 
 	CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
 	*decoded = addr;
 	if (addr < VM_MAXUSER_ADDRESS)
 		*is_user = 1;
 	else
 		*is_user = 0;
 	return (0);
 }
 
 /*
  * Forward the query for [pa, pa + size) to mem_valid() and return its
  * result unchanged.
  */
 static boolean_t
 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
 {
 	boolean_t result;
 
 	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
 	result = mem_valid(pa, size);
 	return (result);
 }
 
 /*
  * Dump-scan initialization hook; not implemented for the radix MMU.
  *
  * The parameter list is spelled "(void)" so that the definition is a real
  * prototype rather than an old-style declaration without one.
  */
 static void
 mmu_radix_scan_init(void)
 {
 
 	CTR1(KTR_PMAP, "%s()", __func__);
 	UNIMPLEMENTED();
 }
 
 /*
  * Kernel-dump mapping hook; not implemented for the radix MMU.
  */
 static void
 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
 {
 
 	CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
 	UNIMPLEMENTED();
 }
 
 /*
  * Return a kernel virtual address for page "m"; the page's direct-map
  * address is used, so no mapping is created.
  */
 vm_offset_t
 mmu_radix_quick_enter_page(vm_page_t m)
 {
 
 	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
 }
 
 /*
  * Counterpart of mmu_radix_quick_enter_page().  The argument is only
  * traced; nothing is unmapped here.
  */
 void
 mmu_radix_quick_remove_page(vm_offset_t addr __unused)
 {
 
 	/* Trace only. */
 	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
 }
 
 /*
  * Flush the data cache for the virtual address range [sva, eva).
  */
 static void
 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 {
 	vm_size_t len;
 
 	len = eva - sva;
 	cpu_flush_dcache((void *)sva, len);
 }
 
 /*
  * Change the memory attribute of the kernel virtual range
  * [va, va + size) to "mode".  Takes the kernel pmap lock around
  * pmap_change_attr_locked() (with cache flushing enabled) and returns
  * that function's error code.
  */
 int
 mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
     vm_memattr_t mode)
 {
 	int rv;
 
 	CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_attr_locked(va, size, mode, true);
 	PMAP_UNLOCK(kernel_pmap);
 	return (rv);
 }
 
 /*
  * Change the memory attribute of the kernel virtual range
  * [va, va + size) to "mode".  The kernel pmap lock must be held.
  *
  * The work is done in two passes.  The first pass verifies that the whole
  * range is mapped and demotes 1GB and 2MB leaf mappings that are only
  * partially covered by the range (and do not already have the requested
  * attribute).  The second pass rewrites the attribute bits of each leaf
  * entry and, for addresses at or above VM_MIN_KERNEL_ADDRESS whose
  * physical frame lies below dmaplimit, recursively applies the same
  * change to the corresponding direct-map range.
  *
  * Returns 0 on success, EINVAL if the range is below the direct map or
  * contains an unmapped page, and ENOMEM if a demotion fails.  If anything
  * was changed the kernel pmap's TLB entries are invalidated and, when
  * "flush" is true, the data cache is flushed for the processed range.
  */
 static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
 {
 	vm_offset_t base, offset, tmpva;
 	vm_paddr_t pa_start, pa_end, pa_end1;
 	pml2_entry_t *l2e;
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 	int cache_bits, error;
 	boolean_t changed;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/*
 	 * Only supported on kernel virtual addresses, including the direct
 	 * map but excluding the recursive map.
 	 */
 	if (base < DMAP_MIN_ADDRESS)
 		return (EINVAL);
 
 	cache_bits = pmap_cache_bits(mode);
 	changed = FALSE;
 
 	/*
 	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 	 * into 4KB pages if required.
 	 */
 	for (tmpva = base; tmpva < base + size; ) {
 		l2e = pmap_pml2e(kernel_pmap, tmpva);
 		if (l2e == NULL || *l2e == 0)
 			return (EINVAL);
 		if (be64toh(*l2e) & RPTE_LEAF) {
 			/*
 			 * If the current 1GB page already has the required
 			 * memory type, then we need not demote this page. Just
 			 * increment tmpva to the next 1GB page frame.
 			 */
 			if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
 				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 1GB page frame
 			 * and there is at least 1GB left within the range, then
 			 * we need not break down this page into 2MB pages.
 			 *
 			 * Advance by the full page size (not the mask), so
 			 * that tmpva lands on the next 1GB frame instead of
 			 * on the last byte of this one, which would trigger
 			 * a needless demotion on the next iteration.
 			 */
 			if ((tmpva & L2_PAGE_MASK) == 0 &&
 			    tmpva + L2_PAGE_MASK < base + size) {
 				tmpva += L2_PAGE_SIZE;
 				continue;
 			}
 			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
 				return (ENOMEM);
 		}
 		l3e = pmap_l2e_to_l3e(l2e, tmpva);
 		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
 		    tmpva, l2e));
 		if (*l3e == 0)
 			return (EINVAL);
 		if (be64toh(*l3e) & RPTE_LEAF) {
 			/*
 			 * If the current 2MB page already has the required
 			 * memory type, then we need not demote this page. Just
 			 * increment tmpva to the next 2MB page frame.
 			 */
 			if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
 				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 2MB page frame
 			 * and there is at least 2MB left within the range, then
 			 * we need not break down this page into 4KB pages.
 			 */
 			if ((tmpva & L3_PAGE_MASK) == 0 &&
 			    tmpva + L3_PAGE_MASK < base + size) {
 				tmpva += L3_PAGE_SIZE;
 				continue;
 			}
 			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
 				return (ENOMEM);
 		}
 		pte = pmap_l3e_to_pte(l3e, tmpva);
 		if (*pte == 0)
 			return (EINVAL);
 		tmpva += PAGE_SIZE;
 	}
 	error = 0;
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * cache mode if required.
 	 *
 	 * pa_start/pa_end track the current run of physically contiguous
 	 * frames whose direct-map alias still has to be updated;
 	 * "pa_start == pa_end" means that no run is open.
 	 */
 	pa_start = pa_end = 0;
 	for (tmpva = base; tmpva < base + size; ) {
 		l2e = pmap_pml2e(kernel_pmap, tmpva);
 		if (be64toh(*l2e) & RPTE_LEAF) {
 			if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
 				pmap_pte_attr(l2e, cache_bits,
 				    RPTE_ATTR_MASK);
 				changed = TRUE;
 			}
 			/*
 			 * The entry is stored big-endian; convert it before
 			 * extracting the frame, as everywhere else.
 			 */
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = be64toh(*l2e) & PG_PS_FRAME;
 					pa_end = pa_start + L2_PAGE_SIZE;
 				} else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
 					pa_end += L2_PAGE_SIZE;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, flush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = be64toh(*l2e) & PG_PS_FRAME;
 					pa_end = pa_start + L2_PAGE_SIZE;
 				}
 			}
 			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
 			continue;
 		}
 		l3e = pmap_l2e_to_l3e(l2e, tmpva);
 		if (be64toh(*l3e) & RPTE_LEAF) {
 			if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
 				pmap_pte_attr(l3e, cache_bits,
 				    RPTE_ATTR_MASK);
 				changed = TRUE;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = be64toh(*l3e) & PG_PS_FRAME;
 					pa_end = pa_start + L3_PAGE_SIZE;
 				} else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
 					pa_end += L3_PAGE_SIZE;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, flush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = be64toh(*l3e) & PG_PS_FRAME;
 					pa_end = pa_start + L3_PAGE_SIZE;
 				}
 			}
 			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
 		} else {
 			pte = pmap_l3e_to_pte(l3e, tmpva);
 			if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
 				pmap_pte_attr(pte, cache_bits,
 				    RPTE_ATTR_MASK);
 				changed = TRUE;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (be64toh(*pte) & PG_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = be64toh(*pte) & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				} else if (pa_end == (be64toh(*pte) & PG_FRAME))
 					pa_end += PAGE_SIZE;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, flush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = be64toh(*pte) & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				}
 			}
 			tmpva += PAGE_SIZE;
 		}
 	}
 	/* Flush the final open run, clamped to the direct map limit. */
 	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 		pa_end1 = MIN(pa_end, dmaplimit);
 		if (pa_start != pa_end1)
 			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 			    pa_end1 - pa_start, mode, flush);
 	}
 
 	/*
 	 * Flush CPU caches if required to make sure any data isn't cached that
 	 * shouldn't be, etc.
 	 */
 	if (changed) {
 		pmap_invalidate_all(kernel_pmap);
 
 		if (flush)
 			pmap_invalidate_cache_range(base, tmpva);
 	}
 	return (error);
 }
 
 /*
  * Allocate physical memory for the vm_page array and map it into KVA,
  * attempting to back the vm_pages with domain-local memory.
  *
  * In the code that is currently compiled, the array is carved out with a
  * single vm_phys_early_alloc(0, ...) call and mapped via mmu_radix_map();
  * the per-domain (NUMA) population loop is disabled under "notyet".
  * Sets the globals vm_page_array_size and vm_page_array.
  */
 void
 mmu_radix_page_array_startup(long pages)
 {
 #ifdef notyet
 	pml2_entry_t *l2e;
 	pml3_entry_t *pde;
 	pml3_entry_t newl3;
 	vm_offset_t va;
 	long pfn;
 	int domain, i;
 #endif
 	vm_paddr_t pa;
 	vm_offset_t start, end;
 
 	vm_page_array_size = pages;
 
 	/* Virtual extent needed for "pages" struct vm_page entries. */
 	start = VM_MIN_KERNEL_ADDRESS;
 	end = start + pages * sizeof(struct vm_page);
 
 	/* presumably the first argument is the memory domain -- TODO confirm */
 	pa = vm_phys_early_alloc(0, end - start);
 
 	/*
 	 * NOTE(review): "start" is both passed by reference and overwritten
 	 * with the return value; confirm against mmu_radix_map()'s contract.
 	 */
 	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
 #ifdef notyet
 	/* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
 	for (va = start; va < end; va += L3_PAGE_SIZE) {
 		pfn = first_page + (va - start) / sizeof(struct vm_page);
 		domain = vm_phys_domain(ptoa(pfn));
 		l2e = pmap_pml2e(kernel_pmap, va);
 		if ((be64toh(*l2e) & PG_V) == 0) {
 			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
 			dump_add_page(pa);
 			pagezero(PHYS_TO_DMAP(pa));
 			pde_store(l2e, (pml2_entry_t)pa);
 		}
 		pde = pmap_l2e_to_l3e(l2e, va);
 		if ((be64toh(*pde) & PG_V) != 0)
 			panic("Unexpected pde %p", pde);
 		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
 		for (i = 0; i < NPDEPG; i++)
 			dump_add_page(pa + i * PAGE_SIZE);
 		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
 		pte_store(pde, newl3);
 	}
 #endif
 	vm_page_array = (vm_page_t)start;
 }
 
 #ifdef DDB
 #include <sys/kdb.h>
 #include <ddb/ddb.h>
 
 /*
  * DDB helper: print the page table entries encountered while translating
  * "va", starting from the level 1 table "l1".  The walk stops at the first
  * invalid entry, or at a leaf entry at level 2 or 3.
  */
 static void
 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
 {
 	pml1_entry_t *l1e;
 	pml2_entry_t *l2e;
 	pml3_entry_t *l3e;
 	pt_entry_t *pte;
 	pt_entry_t entry;
 
 	l1e = &l1[pmap_pml1e_index(va)];
 	entry = be64toh(*l1e);
 	db_printf("VA %#016lx l1e %#016lx", va, entry);
 	if ((entry & PG_V) == 0)
 		goto eol;
 
 	l2e = pmap_l1e_to_l2e(l1e, va);
 	entry = be64toh(*l2e);
 	db_printf(" l2e %#016lx", entry);
 	if ((entry & PG_V) == 0 || (entry & RPTE_LEAF) != 0)
 		goto eol;
 
 	l3e = pmap_l2e_to_l3e(l2e, va);
 	entry = be64toh(*l3e);
 	db_printf(" l3e %#016lx", entry);
 	if ((entry & PG_V) == 0 || (entry & RPTE_LEAF) != 0)
 		goto eol;
 
 	pte = pmap_l3e_to_pte(l3e, va);
 	db_printf(" pte %#016lx\n", be64toh(*pte));
 	return;
 eol:
 	/* Terminate the partially printed line. */
 	db_printf("\n");
 }
 
 /*
  * DDB helper: for every entry on the pv list of page "m", print the pv
  * entry, its virtual address and owning pmap, followed by a page table
  * walk for that address.  No locks are taken.
  */
 void
 pmap_page_print_mappings(vm_page_t m)
 {
 	pmap_t owner;
 	pv_entry_t pv;
 
 	db_printf("page %p(%lx)\n", m, m->phys_addr);
 	/* need to elide locks if running in ddb */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 		db_printf("pv: %p ", pv);
 		db_printf("va: %#016lx ", pv->pv_va);
 		owner = PV_PMAP(pv);
 		db_printf("pmap %p  ", owner);
 		if (owner == NULL)
 			continue;
 		db_printf("asid: %lu\n", owner->pm_pid);
 		pmap_pte_walk(owner->pm_pml1, pv->pv_va);
 	}
 }
 
 DB_SHOW_COMMAND(pte, pmap_print_pte)
 {
 	vm_offset_t va;
 	pmap_t pmap;
 
 	if (!have_addr) {
 		db_printf("show pte addr\n");
 		return;
 	}
 	va = (vm_offset_t)addr;
 
 	if (va >= DMAP_MIN_ADDRESS)
 		pmap = kernel_pmap;
 	else if (kdb_thread != NULL)
 		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
 	else
 		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
 
 	pmap_pte_walk(pmap->pm_pml1, va);
 }
 
 #endif
diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c
index 886e11c9b356..c107fcbcfc43 100644
--- a/sys/powerpc/aim/slb.c
+++ b/sys/powerpc/aim/slb.c
@@ -1,629 +1,626 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Nathan Whitehorn
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 
 #include <machine/md_var.h>
 #include <machine/platform.h>
 #include <machine/vmparam.h>
 #include <machine/trap.h>
 
 #include "mmu_oea64.h"
 
 /* VSID allocator entry points, defined outside this file. */
 uintptr_t moea64_get_unique_vsid(void);
 void moea64_release_vsid(uint64_t vsid);
 static void slb_zone_init(void *);
 
 /* UMA zone backing struct slbtnode allocations (see slb_zone_init()). */
 static uma_zone_t slbt_zone;
 /* UMA zone backing the per-pmap arrays of (n_slbs + 1) struct slb pointers. */
 static uma_zone_t slb_cache_zone;
 /* Number of SLB slots; used below as the bound/modulus for slot selection. */
 int n_slbs = 64;
 
 /* Create both zones at SI_SUB_KMEM time. */
 SYSINIT(slb_zone_init, SI_SUB_KMEM, SI_ORDER_ANY, slb_zone_init, NULL);
 
 /*
  * One node of the per-pmap SLB tree.  Each node has 16 slots, indexed by one
  * nibble of the ESID (see esid2idx()).
  */
 struct slbtnode {
 	/* Bitmap of slots in use; maintained with setbit()/clrbit(). */
 	uint16_t	ua_alloc;
 	/* Tree level of this node; UAD_LEAF_LEVEL marks a leaf. */
 	uint8_t		ua_level;
 	/* Only 36 bits needed for full 64-bit address space. */
 	uint64_t	ua_base;
 	union {
 		/* Interior nodes: child pointers (NULL when absent). */
 		struct slbtnode	*ua_child[16];
 		/* Leaf nodes: the SLB entries themselves. */
 		struct slb	slb_entries[16];
 	} u;
 };
 
 /*
  * For a full 64-bit address space, there are 36 bits in play in an
  * esid.  Each level consumes 4 bits, so there are 9 levels, numbered
  * 8 (root) down to 0 (leaf).
  *
  * |3333|3322|2222|2222|1111|1111|11  |    |    |  esid
  * |5432|1098|7654|3210|9876|5432|1098|7654|3210|  bits
  * +----+----+----+----+----+----+----+----+----+--------
  * | 8  | 7  | 6  | 5  | 4  | 3  | 2  | 1  | 0  | level
  */
 #define UAD_ROOT_LEVEL  8
 #define UAD_LEAF_LEVEL  0
 
 /*
  * Return the 4-bit slot index that 'esid' selects in a node at 'level'
  * (level 0 uses ESID bits 3:0, level 1 uses bits 7:4, and so on).
  */
 static inline int
 esid2idx(uint64_t esid, int level)
 {
 
 	return ((esid >> (level * 4)) & 0xF);
 }
 
 /*
  * Sanity predicate for a tree node: ua_base must have its low 4*(level+1)
  * bits clear, i.e. only the ESID bits above the nibble that the node itself
  * indexes may be set.  esid2base() applies exactly that mask, so a
  * well-formed base is unchanged by it.
  */
 #define uad_baseok(ua)                          \
 	(esid2base((ua)->ua_base, (ua)->ua_level) == (ua)->ua_base)
 
 /*
  * Return 'esid' with its low 4*(level+1) bits cleared: the common ESID
  * prefix shared by everything stored beneath a node at 'level'.
  */
 static inline uint64_t
 esid2base(uint64_t esid, int level)
 {
 	const int nbits = 4 * (level + 1);
 
 	return (esid & ~((1ULL << nbits) - 1));
 }
 
 /*
  * Allocate a new leaf node for the specified esid/vmhandle from the
  * parent node.
  *
  * The leaf is fully initialised (level, base, the one SLB entry for 'esid',
  * allocation bit) before it is linked into 'parent'.  Returns a pointer to
  * the new SLB entry inside the leaf.  The parent's slot for 'esid' must be
  * empty on entry.
  */
 static struct slb *
 make_new_leaf(uint64_t esid, uint64_t slbv, struct slbtnode *parent)
 {
 	struct slbtnode *child;
 	struct slb *retval;
 	int idx;
 
 	idx = esid2idx(esid, parent->ua_level);
 	KASSERT(parent->u.ua_child[idx] == NULL, ("Child already exists!"));
 
 	/* unlock and M_WAITOK and loop? */
 	child = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO);
 	/* NOTE(review): allocation failure is only caught by this KASSERT. */
 	KASSERT(child != NULL, ("unhandled NULL case"));
 
 	child->ua_level = UAD_LEAF_LEVEL;
 	child->ua_base = esid2base(esid, child->ua_level);
 	/* From here 'idx' is the slot inside the new leaf. */
 	idx = esid2idx(esid, child->ua_level);
 	child->u.slb_entries[idx].slbv = slbv;
 	child->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
 	setbit(&child->ua_alloc, idx);
 
 	retval = &child->u.slb_entries[idx];
 
 	/*
 	 * The above stores must be visible before the next one, so
 	 * that a lockless searcher always sees a valid path through
 	 * the tree.
 	 */
 	powerpc_lwsync();
 
 	/* Back to the parent's slot: publish the leaf. */
 	idx = esid2idx(esid, parent->ua_level);
 	parent->u.ua_child[idx] = child;
 	setbit(&parent->ua_alloc, idx);
 
 	return (retval);
 }
 
 /*
  * Allocate a new intermediate node to fit between the parent and
  * esid.
  *
  * The existing child in parent's slot for 'esid' does not cover 'esid'
  * (its base differs).  A new node is created at the lowest level where the
  * child's base and 'esid' share a prefix, the existing child is hung beneath
  * it, and the new node replaces the child in the parent.  Returns the new
  * intermediate node; the caller adds the leaf for 'esid' under it.
  */
 static struct slbtnode*
 make_intermediate(uint64_t esid, struct slbtnode *parent)
 {
 	struct slbtnode *child, *inter;
 	int idx, level;
 
 	idx = esid2idx(esid, parent->ua_level);
 	child = parent->u.ua_child[idx];
 	KASSERT(esid2base(esid, child->ua_level) != child->ua_base,
 	    ("No need for an intermediate node?"));
 
 	/*
 	 * Find the level where the existing child and our new esid
 	 * meet.  It must be lower than parent->ua_level or we would
 	 * have chosen a different index in parent.
 	 */
 	level = child->ua_level + 1;
 	while (esid2base(esid, level) !=
 	    esid2base(child->ua_base, level))
 		level++;
 	KASSERT(level < parent->ua_level,
 	    ("Found splitting level %d for %09jx and %09jx, "
 	    "but it's the same as %p's",
 	    level, esid, child->ua_base, parent));
 
 	/* unlock and M_WAITOK and loop? */
 	inter = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO);
 	/* NOTE(review): allocation failure is only caught by this KASSERT. */
 	KASSERT(inter != NULL, ("unhandled NULL case"));
 
 	/* Set up intermediate node to point to child ... */
 	inter->ua_level = level;
 	inter->ua_base = esid2base(esid, inter->ua_level);
 	idx = esid2idx(child->ua_base, inter->ua_level);
 	inter->u.ua_child[idx] = child;
 	setbit(&inter->ua_alloc, idx);
 	/* Make the new node's contents visible before it is linked in. */
 	powerpc_lwsync();
 
 	/* Set up parent to point to intermediate node ... */
 	idx = esid2idx(inter->ua_base, parent->ua_level);
 	parent->u.ua_child[idx] = inter;
 	setbit(&parent->ua_alloc, idx);
 
 	return (inter);
 }
 
 /*
  * Build the SLBV word for kernel virtual address 'va': the deterministic
  * kernel VSID for the segment, plus SLBV_L when the address is considered
  * to be backed by large pages.
  */
 uint64_t
 kernel_va_to_slbv(vm_offset_t va)
 {
 	uint64_t slbv;
 	int large;
 
 	/* Set kernel VSID to deterministic value */
 	slbv = (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)) << SLBV_VSID_SHIFT;
 
 	/*
 	 * Figure out if this is a large-page mapping.
 	 */
 	if (hw_direct_map && va > DMAP_BASE_ADDRESS && va < DMAP_MAX_ADDRESS) {
 		/*
 		 * XXX: If we have set up a direct map, assumes
 		 * all physical memory is mapped with large pages.
 		 */
 		large = (mem_valid(DMAP_TO_PHYS(va), 0) == 0);
 	} else {
 		/* Otherwise only the vm_page_array range qualifies. */
 		large = (moea64_large_page_size != 0 &&
 		    va >= (vm_offset_t)vm_page_array &&
 		    va <= (uintptr_t)(&vm_page_array[vm_page_array_size]));
 	}
 
 	if (large)
 		slbv |= SLBV_L;
 
 	return (slbv);
 }
 
 /*
  * Look up the SLB entry covering user address 'va' in pmap 'pm'.
  *
  * Descends the SLB tree from the root using one ESID nibble per level.
  * Returns the leaf entry if it is marked SLBE_VALID, or NULL if the path
  * ends early, the child found does not cover the ESID, or the leaf entry is
  * not valid.  The function takes no locks itself.
  */
 struct slb *
 user_va_to_slb_entry(pmap_t pm, vm_offset_t va)
 {
 	uint64_t esid = va >> ADDR_SR_SHFT;
 	struct slbtnode *ua;
 	int idx;
 
 	ua = pm->pm_slb_tree_root;
 
 	for (;;) {
 		KASSERT(uad_baseok(ua), ("uad base %016jx level %d bad!",
 		    ua->ua_base, ua->ua_level));
 		idx = esid2idx(esid, ua->ua_level);
 
 		/*
 		 * This code is specific to ppc64 where a load is
 		 * atomic, so no need for atomic_load macro.
 		 */
 		if (ua->ua_level == UAD_LEAF_LEVEL)
 			return ((ua->u.slb_entries[idx].slbe & SLBE_VALID) ?
 			    &ua->u.slb_entries[idx] : NULL);
 
 		/*
 		 * The following accesses are implicitly ordered under the POWER
 		 * ISA by load dependencies (the store ordering is provided by
 		 * the powerpc_lwsync() calls elsewhere) and so are run without
 		 * barriers.
 		 */
 		ua = ua->u.ua_child[idx];
 		/* A child whose base does not match cannot contain this ESID. */
 		if (ua == NULL ||
 		    esid2base(esid, ua->ua_level) != ua->ua_base)
 			return (NULL);
 	}
 
 	/* Not reached: the loop above only exits via return. */
 	return (NULL);
 }
 
 /*
  * Return the VSID for address 'va' in pmap 'pm'.  The kernel pmap uses the
  * deterministic KERNEL_VSID() mapping; for user pmaps the SLB tree is
  * consulted, and a fresh VSID is allocated when the segment has no entry.
  */
 uint64_t
 va_to_vsid(pmap_t pm, vm_offset_t va)
 {
 	struct slb *found;
 
 	/* Shortcut kernel case */
 	if (pm == kernel_pmap)
 		return (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT));
 
 	found = user_va_to_slb_entry(pm, va);
 	if (found != NULL)
 		return ((found->slbv & SLBV_VSID_MASK) >> SLBV_VSID_SHIFT);
 
 	/*
 	 * If there is no vsid for this VA, we need to add a new entry
 	 * to the PMAP's segment table.
 	 */
 	return (allocate_user_vsid(pm,
 	    (uintptr_t)va >> ADDR_SR_SHFT, 0));
 }
 
 /*
  * Allocate a new VSID for segment 'esid' in user pmap 'pm', record it in the
  * pmap's SLB tree (creating a leaf and, if needed, an intermediate node),
  * and pre-load the entry into the pmap's SLB cache.  'large' selects SLBV_L.
  * The pmap lock must be held.  Returns the new VSID.
  */
 uint64_t
 allocate_user_vsid(pmap_t pm, uint64_t esid, int large)
 {
 	uint64_t vsid, slbv;
 	struct slbtnode *ua, *next, *inter;
 	struct slb *slb;
 	int idx;
 
 	KASSERT(pm != kernel_pmap, ("Attempting to allocate a kernel VSID"));
 
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 	vsid = moea64_get_unique_vsid();
 
 	slbv = vsid << SLBV_VSID_SHIFT;
 	if (large)
 		slbv |= SLBV_L;
 
 	ua = pm->pm_slb_tree_root;
 
 	/* Descend to the correct leaf or NULL pointer. */
 	for (;;) {
 		KASSERT(uad_baseok(ua),
 		   ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level));
 		idx = esid2idx(esid, ua->ua_level);
 
 		if (ua->ua_level == UAD_LEAF_LEVEL) {
 			/*
 			 * Existing leaf: write slbv first, then (after the
 			 * barrier) the slbe word that carries SLBE_VALID.
 			 */
 			ua->u.slb_entries[idx].slbv = slbv;
 			eieio();
 			ua->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT)
 			    | SLBE_VALID;
 			setbit(&ua->ua_alloc, idx);
 			slb = &ua->u.slb_entries[idx];
 			break;
 		}
 
 		next = ua->u.ua_child[idx];
 		if (next == NULL) {
 			/* Empty slot: hang a new leaf directly off this node. */
 			slb = make_new_leaf(esid, slbv, ua);
 			break;
                 }
 
 		/*
 		 * Check if the next item down has an okay ua_base.
 		 * If not, we need to allocate an intermediate node.
 		 */
 		if (esid2base(esid, next->ua_level) != next->ua_base) {
 			inter = make_intermediate(esid, ua);
 			slb = make_new_leaf(esid, slbv, inter);
 			break;
 		}
 
 		ua = next;
 	}
 
 	/*
 	 * Someone probably wants this soon, and it may be a wired
 	 * SLB mapping, so pre-spill this entry.
 	 */
 	eieio();
 	slb_insert_user(pm, slb);
 
 	return (vsid);
 }
 
 /*
  * Invalidate the SLB tree entry for segment 'esid' in pmap 'pm'.  The leaf
  * entry's slbv and slbe are zeroed and its allocation bit cleared; tree
  * nodes are not freed here and moea64_release_vsid() is not called from
  * this function.  'large' is not used.  The pmap lock must be held.
  */
 void
 free_vsid(pmap_t pm, uint64_t esid, int large)
 {
 	struct slbtnode *ua;
 	int idx;
 
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 
 	ua = pm->pm_slb_tree_root;
 	/* Descend to the correct leaf. */
 	for (;;) {
 		KASSERT(uad_baseok(ua),
 		   ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level));
 		
 		idx = esid2idx(esid, ua->ua_level);
 		if (ua->ua_level == UAD_LEAF_LEVEL) {
 			/* Clear slbv, then (after the barrier) slbe. */
 			ua->u.slb_entries[idx].slbv = 0;
 			eieio();
 			ua->u.slb_entries[idx].slbe = 0;
 			clrbit(&ua->ua_alloc, idx);
 			return;
 		}
 
 		ua = ua->u.ua_child[idx];
 		if (ua == NULL ||
 		    esid2base(esid, ua->ua_level) != ua->ua_base) {
 			/* Perhaps just return instead of assert? */
 			KASSERT(0,
 			    ("Asked to remove an entry that was never inserted!"));
 			return;
 		}
 	}
 }
 
 /*
  * Recursively free the subtree rooted at 'ua'.  For a leaf, every entry
  * with a non-zero slbv has its VSID released; for an interior node, every
  * non-NULL child is freed first.  The node itself is then returned to
  * slbt_zone.
  */
 static void
 free_slb_tree_node(struct slbtnode *ua)
 {
 	struct slbtnode *kid;
 	int slot;
 
 	if (ua->ua_level == UAD_LEAF_LEVEL) {
 		for (slot = 0; slot < 16; slot++) {
 			if (ua->u.slb_entries[slot].slbv == 0)
 				continue;
 			moea64_release_vsid(ua->u.slb_entries[slot].slbv
 			    >> SLBV_VSID_SHIFT);
 		}
 	} else {
 		for (slot = 0; slot < 16; slot++) {
 			kid = ua->u.ua_child[slot];
 			if (kid != NULL)
 				free_slb_tree_node(kid);
 		}
 	}
 
 	uma_zfree(slbt_zone, ua);
 }
 
 /*
  * Free the whole SLB tree of pmap 'pm', starting from its root node.
  */
 void
 slb_free_tree(pmap_t pm)
 {
 	struct slbtnode *root;
 
 	root = pm->pm_slb_tree_root;
 	free_slb_tree_node(root);
 }
 
 /*
  * Allocate an empty (zeroed) SLB tree and return its root node, which sits
  * at UAD_ROOT_LEVEL.
  */
 struct slbtnode *
 slb_alloc_tree(void)
 {
 	struct slbtnode *node;
 
 	node = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO);
 	KASSERT(node != NULL, ("unhandled NULL case"));
 
 	node->ua_level = UAD_ROOT_LEVEL;
 	return (node);
 }
 
 /*
  * Lock entries mapping kernel text and stacks.
  *
  * Insert the kernel SLB entry (slbe, slbv) into this CPU's SLB cache.  A
  * free slot is used if one exists; otherwise a slot chosen from the timebase
  * is overwritten.  USER_SLB_SLOT is never used for a kernel entry.  Once the
  * pmap is bootstrapped the entry is also written to the hardware SLB.
  */
 
 void
 slb_insert_kernel(uint64_t slbe, uint64_t slbv)
 {
 	struct slb *slbcache;
 	int i;
 
 	/* We don't want to be preempted while modifying the kernel map */
 	critical_enter();
 
 	slbcache = PCPU_GET(aim.slb);
 
 	/* Check for an unused slot, abusing the user slot as a full flag */
 	if (slbcache[USER_SLB_SLOT].slbe == 0) {
 		for (i = 0; i < n_slbs; i++) {
 			if (i == USER_SLB_SLOT)
 				continue;
 			if (!(slbcache[i].slbe & SLBE_VALID)) 
 				goto fillkernslb;
 		}
 
 		/*
 		 * The loop ran to completion, so no slot was free: remember
 		 * that the cache is full to skip the scan next time.
 		 */
 		if (i == n_slbs)
 			slbcache[USER_SLB_SLOT].slbe = 1;
 	}
 
 	/* Pick a victim from the timebase, stepping past the user slot. */
 	i = mftb() % n_slbs;
 	if (i == USER_SLB_SLOT)
 			i = (i+1) % n_slbs;
 
 fillkernslb:
 	KASSERT(i != USER_SLB_SLOT,
 	    ("Filling user SLB slot with a kernel mapping"));
 	slbcache[i].slbv = slbv;
 	/* The slot index is OR'ed into the low bits of the cached slbe. */
 	slbcache[i].slbe = slbe | (uint64_t)i;
 
 	/* If it is for this CPU, put it in the SLB right away */
 	if (pmap_bootstrapped) {
 		/* slbie not required */
 		__asm __volatile ("slbmte %0, %1" :: 
 		    "r"(slbcache[i].slbv), "r"(slbcache[i].slbe)); 
 	}
 
 	critical_exit();
 }
 
 /*
  * Record 'slb' in the SLB cache of pmap 'pm'.  While the cache has fewer
  * than n_slbs entries the next free slot is used; once it is full, a slot
  * chosen from the timebase is overwritten.  The pmap lock must be held.
  */
 void
 slb_insert_user(pmap_t pm, struct slb *slb)
 {
 	int slot;
 
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 
 	if (pm->pm_slb_len >= n_slbs)
 		slot = mftb() % n_slbs;
 	else
 		slot = pm->pm_slb_len++;
 
 	/* Note that this replacement is atomic with respect to trap_subr */
 	pm->pm_slb[slot] = slb;
 }
 
 /*
  * UMA backend allocator used for the SLB zones when real-mode addressing is
  * limited (installed by slb_zone_init()).  Allocates one wired page whose
  * physical address lies below platform_real_maxaddr() and returns a kernel
  * virtual address for it: the direct-map address when hw_direct_map is set,
  * otherwise (pa | DMAP_BASE_ADDRESS), which is entered with pmap_kenter().
  * Returns NULL if no page could be allocated.
  */
 static void *
 slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
     u_int8_t *flags, int wait)
 {
 	/* Cached result of platform_real_maxaddr(); 0 means "not yet read". */
 	static vm_offset_t realmax = 0;
 	void *va;
 	vm_page_t m;
 
 	if (realmax == 0)
 		realmax = platform_real_maxaddr();
 
 	*flags = UMA_SLAB_PRIV;
-	m = vm_page_alloc_contig_domain(NULL, 0, domain,
-	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED,
-	    1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT);
+	m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) |
+	    VM_ALLOC_WIRED, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE,
+	    VM_MEMATTR_DEFAULT);
 	if (m == NULL)
 		return (NULL);
 
 	if (hw_direct_map)
 		va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	else {
 		va = (void *)(VM_PAGE_TO_PHYS(m) | DMAP_BASE_ADDRESS);
 		pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m));
 	}
 
-	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
-		bzero(va, PAGE_SIZE);
-
 	return (va);
 }
 
 /*
  * SYSINIT hook: create the UMA zones for SLB tree nodes and for the per-pmap
  * SLB cache arrays.  When platform_real_maxaddr() is anything other than
  * VM_MAX_ADDRESS, both zones are switched to slb_uma_real_alloc().
  */
 static void
 slb_zone_init(void *dummy)
 {
 
 	slbt_zone = uma_zcreate("SLB tree node", sizeof(struct slbtnode),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_CONTIG | UMA_ZONE_VM);
 	slb_cache_zone = uma_zcreate("SLB cache",
 	    (n_slbs + 1)*sizeof(struct slb *), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG | UMA_ZONE_VM);
 
 	/* Nothing more to do when real mode can reach all of memory. */
 	if (platform_real_maxaddr() == VM_MAX_ADDRESS)
 		return;
 
 	uma_zone_set_allocf(slb_cache_zone, slb_uma_real_alloc);
 	uma_zone_set_allocf(slbt_zone, slb_uma_real_alloc);
 }
 
 struct slb **
 slb_alloc_user_cache(void)
 {
 	return (uma_zalloc(slb_cache_zone, M_ZERO));
 }
 
 /*
  * Return an SLB cache array obtained from slb_alloc_user_cache() to its
  * UMA zone.
  */
 void
 slb_free_user_cache(struct slb **slb)
 {
 	uma_zfree(slb_cache_zone, slb);
 }
 
 /*
  * Handle kernel SLB faults -- runs in real mode, all seat belts off.
  *
  * The faulting address is srr0 for an instruction fault (EXC_ISE) and dar
  * otherwise.  If this CPU's SLB cache already holds the matching entry
  * nothing is changed; otherwise an entry is written into a free slot, or
  * over a slot chosen from the timebase, never USER_SLB_SLOT.  Only the
  * cache is updated here.
  */
 void
 handle_kernel_slb_spill(int type, register_t dar, register_t srr0)
 {
 	struct slb *slbcache;
 	uint64_t slbe, slbv;
 	uint64_t esid, addr;
 	int i;
 
 	addr = (type == EXC_ISE) ? srr0 : dar;
 	slbcache = PCPU_GET(aim.slb);
 	esid = (uintptr_t)addr >> ADDR_SR_SHFT;
 	slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
 
 	/* See if the hardware flushed this somehow (can happen in LPARs) */
 	for (i = 0; i < n_slbs; i++)
 		if (slbcache[i].slbe == (slbe | (uint64_t)i))
 			return;
 
 	/* Not in the map, needs to actually be added */
 	slbv = kernel_va_to_slbv(addr);
 	/* A zero slbe in the user slot means "a free slot may still exist". */
 	if (slbcache[USER_SLB_SLOT].slbe == 0) {
 		for (i = 0; i < n_slbs; i++) {
 			if (i == USER_SLB_SLOT)
 				continue;
 			if (!(slbcache[i].slbe & SLBE_VALID))
 				goto fillkernslb;
 		}
 
 		/* Scan found nothing free: flag the cache as full. */
 		if (i == n_slbs)
 			slbcache[USER_SLB_SLOT].slbe = 1;
 	}
 
 	/* Sacrifice a random SLB entry that is not the user entry */
 	i = mftb() % n_slbs;
 	if (i == USER_SLB_SLOT)
 		i = (i+1) % n_slbs;
 
 fillkernslb:
 	/* Write new entry */
 	slbcache[i].slbv = slbv;
 	slbcache[i].slbe = slbe | (uint64_t)i;
 
 	/* Trap handler will restore from cache on exit */
 }
 
 /*
  * Handle an SLB miss on user address 'addr' for pmap 'pm'.  If the segment
  * has no tree entry yet, a VSID is allocated (which also caches the entry);
  * otherwise the existing entry is added to the pmap's SLB cache unless it is
  * already there.  Returns -1 if the pmap has no SLB cache, 0 otherwise.
  */
 int 
 handle_user_slb_spill(pmap_t pm, vm_offset_t addr)
 {
 	struct slb *entry;
 	uint64_t esid;
 	int n;
 
 	if (pm->pm_slb == NULL)
 		return (-1);
 
 	esid = (uintptr_t)addr >> ADDR_SR_SHFT;
 
 	PMAP_LOCK(pm);
 	entry = user_va_to_slb_entry(pm, addr);
 	if (entry != NULL) {
 		/*
 		 * Check that another CPU has not already mapped this.
 		 * XXX: Per-thread SLB caches would be better.
 		 */
 		for (n = 0; n < pm->pm_slb_len; n++) {
 			if (pm->pm_slb[n] == entry)
 				goto unlock;
 		}
 		slb_insert_user(pm, entry);
 	} else {
 		/* allocate_vsid auto-spills it */
 		(void)allocate_user_vsid(pm, esid, 0);
 	}
 
 unlock:
 	PMAP_UNLOCK(pm);
 
 	return (0);
 }
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
index e1ff056117eb..9abf75a731f5 100644
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -1,4776 +1,4774 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  * Copyright (c) 2014 Andrew Turner
  * All rights reserved.
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Portions of this software were developed by Andrew Turner under
  * sponsorship from The FreeBSD Foundation.
  *
  * Portions of this software were developed by SRI International and the
  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Portions of this software were developed by the University of Cambridge
  * Computer Laboratory as part of the CTSRD Project, with support from the
  * UK Higher Education Innovation Fund (HEIF).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/physmem.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_dumpset.h>
 #include <vm/uma.h>
 
 #include <machine/machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/sbi.h>
 
 /* Products of the per-table entry count for two and three table levels. */
 #define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
 #define	NUL2E		(Ln_ENTRIES * NUL1E)
 
 /* PMAP_INLINE expands to nothing under DIAGNOSTIC, else to an inline spec. */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /* PV_STAT(x) evaluates x only when PV_STATS is defined. */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /* Index of the L2-sized region containing v, and its pv_table header. */
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
 
 #define	NPV_LIST_LOCKS	MAXCPU
 
 /* PV list locks are hashed by L2 region index over NPV_LIST_LOCKS locks. */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp the PV list lock for 'pa', write-locked.  If a different lock
  * is currently held through *lockp it is released first; if the right lock
  * is already held nothing happens.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as above, keyed by the physical address of vm_page 'm'. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /* Drop the lock held through *lockp, if any, and reset *lockp to NULL. */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /* The list of all the user pmaps */
 LIST_HEAD(pmaplist, pmap);
 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
 
 /* Storage for the kernel's own pmap. */
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 vm_offset_t kernel_vm_end = 0;
 
 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
 
 /* This code assumes all L1 DMAP entries will be used */
 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
 
 static struct rwlock_padalign pvh_global_lock;
 /* Held while walking or updating 'allpmaps' (see pmap_distribute_l1()). */
 static struct mtx_padalign allpmaps_lock;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM/pmap parameters");
 
 /* Read-only tunable (CTLFLAG_RDTUN); defaults to enabled. */
 static int superpages_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
     CTLFLAG_RDTUN, &superpages_enabled, 0,
     "Enable support for transparent superpages");
 
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "2MB page mapping counters");
 
 /* Read-only event counters exported under vm.pmap.l2. */
 static u_long pmap_l2_demotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l2_demotions, 0,
     "2MB page demotions");
 
 static u_long pmap_l2_mappings;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_l2_mappings, 0,
     "2MB page mappings");
 
 static u_long pmap_l2_p_failures;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_l2_p_failures, 0,
     "2MB page promotion failures");
 
 static u_long pmap_l2_promotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_l2_promotions, 0,
     "2MB page promotions");
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 /* Lock array indexed through PHYS_TO_PV_LIST_LOCK(). */
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 /* Table of md_page headers indexed through pa_to_pvh(). */
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 extern cpuset_t all_harts;
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 /* Forward declarations of file-local helpers. */
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
 		    vm_offset_t va, struct rwlock **lockp);
 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
 		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 
 /*
  * Page-table entry accessors.  Every access goes through a 64-bit atomic
  * operation: load, store, swap (load_store), set bits, or clear bits.
  */
 #define	pmap_clear(pte)			pmap_store(pte, 0)
 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
 #define	pmap_load(pte)			atomic_load_64(pte)
 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
 
 /********************/
 /* Inline functions */
 /********************/
 
 /* Copy one page (PAGE_SIZE bytes) from 's' to 'd'; note the (src, dst) order. */
 static __inline void
 pagecopy(void *s, void *d)
 {
 
 	memcpy(d, s, PAGE_SIZE);
 }
 
 /* Zero one page (PAGE_SIZE bytes) starting at 'p'. */
 static __inline void
 pagezero(void *p)
 {
 
 	bzero(p, PAGE_SIZE);
 }
 
 /* Table index selected by 'va' at each page-table level. */
 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
 
 /*
  * Physical address encoded in a table entry, after masking off the
  * PTE_HI_MASK bits.  PTE_TO_PHYS uses the full page number; L2PTE_TO_PHYS
  * uses the PPN1-and-up field and yields an L2-aligned address.
  */
 #define	PTE_TO_PHYS(pte) \
     ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
 #define	L2PTE_TO_PHYS(l2) \
     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
 
 /*
  * Return a pointer to the L1 entry that covers 'va' in 'pmap'.  The address
  * must be well-formed (asserted).
  */
 static __inline pd_entry_t *
 pmap_l1(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *table;
 
 	KASSERT(VIRT_IS_VALID(va),
 	    ("%s: malformed virtual address %#lx", __func__, va));
 
 	table = pmap->pm_l1;
 	return (table + pmap_l1_index(va));
 }
 
 /*
  * Given an L1 entry that points at an L2 table, return a pointer to the L2
  * entry that covers 'va'.  The table is reached through the direct map.
  */
 static __inline pd_entry_t *
 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
 {
 	pd_entry_t *table;
 
 	table = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
 	return (table + pmap_l2_index(va));
 }
 
 /*
  * Return a pointer to the L2 entry covering 'va' in 'pmap', or NULL when the
  * L1 entry is invalid or has any PTE_RX bit set (it is then a leaf mapping
  * rather than a pointer to an L2 table).
  */
 static __inline pd_entry_t *
 pmap_l2(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l1e;
 
 	l1e = pmap_l1(pmap, va);
 	if ((pmap_load(l1e) & PTE_V) == 0 || (pmap_load(l1e) & PTE_RX) != 0)
 		return (NULL);
 
 	return (pmap_l1_to_l2(l1e, va));
 }
 
 /*
  * Given an L2 entry that points at an L3 table, return a pointer to the L3
  * entry (PTE) that covers 'va'.  The table is reached through the direct map.
  */
 static __inline pt_entry_t *
 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
 {
 	vm_paddr_t phys;
 	pt_entry_t *l3;
 
 	phys = PTE_TO_PHYS(pmap_load(l2));
 	/* Cast to the type actually being assigned and returned. */
 	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
 
 	return (&l3[pmap_l3_index(va)]);
 }
 
 /*
  * Return a pointer to the L3 entry (PTE) covering 'va' in 'pmap', or NULL
  * when there is no L2 entry, the L2 entry is invalid, or the L2 entry has
  * any PTE_RX bit set (it is then a leaf mapping, not a pointer to a table).
  */
 static __inline pt_entry_t *
 pmap_l3(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l2e;
 
 	l2e = pmap_l2(pmap, va);
 	if (l2e == NULL || (pmap_load(l2e) & PTE_V) == 0 ||
 	    (pmap_load(l2e) & PTE_RX) != 0)
 		return (NULL);
 
 	return (pmap_l2_to_l3(l2e, va));
 }
 
 /* Add 'count' to the pmap's resident page count; the pmap lock must be held. */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pmap->pm_stats.resident_count += count;
 }
 
 /*
  * Subtract 'count' from the pmap's resident page count; the pmap lock must
  * be held.  Underflow is asserted against.
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count -= count;
 }
 
 /*
  * Copy a new kernel L1 entry into slot 'l1index' of every pmap on the
  * allpmaps list.  Does nothing unless 'pmap' is the kernel pmap.
  */
 static void
 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
     pt_entry_t entry)
 {
 	struct pmap *pm;
 
 	/* Distribute new kernel L1 entry to all the user pmaps */
 	if (pmap == kernel_pmap) {
 		mtx_lock(&allpmaps_lock);
 		LIST_FOREACH(pm, &allpmaps, pm_list)
 			pmap_store(&pm->pm_l1[l1index], entry);
 		mtx_unlock(&allpmaps_lock);
 	}
 }
 
 /*
  * Early-boot lookup helper: compute the L1 and L2 slot numbers for 'va' and
  * return the bootstrap L2 table (init_pt_va).  The L1 entry for 'va' in the
  * table at 'l1pt' must be a table pointer, not a leaf (asserted).
  */
 static pt_entry_t *
 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
     u_int *l2_slot)
 {
 	pd_entry_t *l1_table;
 
 	l1_table = (pd_entry_t *)l1pt;
 	*l1_slot = pmap_l1_index(va);
 
 	/* Check locore has used a table L1 map */
 	KASSERT((l1_table[*l1_slot] & PTE_RX) == 0,
 		("Invalid bootstrap L1 table"));
 
 	*l2_slot = pmap_l2_index(va);
 
 	/* Find the address of the L2 table */
 	return ((pt_entry_t *)init_pt_va);
 }
 
 /*
  * Early-boot virtual-to-physical translation using the bootstrap tables.
  * The L2 entry for 'va' must be a superpage mapping (asserted); the result
  * is that superpage's physical base plus the offset of 'va' within it.
  */
 static vm_paddr_t
 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
 {
 	pt_entry_t *l2_table;
 	u_int l1_slot, l2_slot;
 	vm_paddr_t base;
 
 	l2_table = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
 
 	/* Check locore has used L2 superpages */
 	KASSERT((l2_table[l2_slot] & PTE_RX) != 0,
 		("Invalid bootstrap L2 table"));
 
 	/* L2 is superpages */
 	base = L2PTE_TO_PHYS(l2_table[l2_slot]);
 
 	return (base + (va & L2_OFFSET));
 }
 
 /*
  * Build the direct map.  Starting at DMAP_MIN_ADDRESS, physical memory from
  * min_pa (rounded down to an L1 boundary) up to max_pa is mapped with L1
  * superpage entries written into the table at "kern_l1".  The resulting
  * bounds are recorded in dmap_phys_base, dmap_phys_max and dmap_max_addr.
  */
 static void
 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
 {
 	pd_entry_t *l1tbl;
 	pt_entry_t pte;
 	vm_offset_t dva;
 	vm_paddr_t dpa;
 	u_int slot;
 
 	l1tbl = (pd_entry_t *)kern_l1;
 	dmap_phys_base = min_pa & ~L1_OFFSET;
 	dpa = dmap_phys_base;
 	dva = DMAP_MIN_ADDRESS;
 	slot = pmap_l1_index(DMAP_MIN_ADDRESS);
 
 	while (dva < DMAP_MAX_ADDRESS && dpa < max_pa) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L1 index"));
 
 		/* Each L1 slot becomes a leaf (superpage) mapping. */
 		pte = PTE_KERN | ((dpa / PAGE_SIZE) << PTE_PPN0_S);
 		pmap_store(&l1tbl[slot], pte);
 
 		dpa += L1_SIZE;
 		dva += L1_SIZE;
 		slot++;
 	}
 
 	/* Remember where the direct map ends. */
 	dmap_phys_max = dpa;
 	dmap_max_addr = dva;
 
 	sfence_vma();
 }
 
 /*
  * Install L3 page-table pages for the kernel virtual range that starts at
  * "va" (which must be L2_SIZE aligned) and runs up to VM_MAX_KERNEL_ADDRESS,
  * one L3 page per L2 slot.  The L3 pages are taken sequentially from the
  * virtual address "l3_start" and are zeroed before returning.
  *
  * Returns the first virtual address past the last L3 page consumed.
  */
 static vm_offset_t
 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
 {
 	vm_offset_t l3pt;
 	pt_entry_t entry;
 	pd_entry_t *l2;
 	vm_paddr_t pa;
 	u_int l2_slot;
 	pn_t pn;
 
 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
 
 	l2 = pmap_l2(kernel_pmap, va);
 	/* Round down to the base of the L2 table so l2_slot indexes into it. */
 	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
 	l2_slot = pmap_l2_index(va);
 	l3pt = l3_start;
 
 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
 
 		/* Point this L2 slot at the next L3 page (valid, non-leaf). */
 		pa = pmap_early_vtophys(l1pt, l3pt);
 		pn = (pa / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(&l2[l2_slot], entry);
 		l3pt += PAGE_SIZE;
 	}
 
 	/* Zero the newly installed L3 page table pages */
 	memset((void *)l3_start, 0, l3pt - l3_start);
 
 	return (l3pt);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	l1pt is the virtual address of the kernel L1 page table set up by
  *	locore, kernstart is the physical address the kernel was loaded at,
  *	and kernlen is the length of the loaded kernel.  On return the direct
  *	map exists, L3 tables for the early devmap are installed, the dynamic
  *	per-CPU area and message buffer are allocated, and the memory used so
  *	far is excluded from the physical allocator.
  */
 void
 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
 {
 	u_int l1_slot, l2_slot;
 	vm_offset_t freemempos;
 	vm_offset_t dpcpu, msgbufpv;
 	vm_paddr_t max_pa, min_pa, pa;
 	pt_entry_t *l2p;
 	int i;
 
 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
 
 	/* Set this early so we can use the pagetable walking functions */
 	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	/*
 	 * Set the current CPU as active in the kernel pmap. Secondary cores
 	 * will add themselves later in init_secondary(). The SBI firmware
 	 * may rely on this mask being precise, so CPU_FILL() is not used.
 	 */
 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
 
 	/* Assume the address we were loaded to is a valid physical address. */
 	min_pa = max_pa = kernstart;
 
 	physmap_idx = physmem_avail(physmap, nitems(physmap));
 	physmap_idx /= 2;
 
 	/*
 	 * Find the minimum and maximum physical addresses. physmap is
 	 * sorted, but may contain empty ranges.
 	 */
 	for (i = 0; i < physmap_idx * 2; i += 2) {
 		if (physmap[i] == physmap[i + 1])
 			continue;
 		if (physmap[i] <= min_pa)
 			min_pa = physmap[i];
 		if (physmap[i + 1] > max_pa)
 			max_pa = physmap[i + 1];
 	}
 	printf("physmap_idx %u\n", physmap_idx);
 	printf("min_pa %lx\n", min_pa);
 	printf("max_pa %lx\n", max_pa);
 
 	/* Create a direct map region early so we can use it for pa -> va */
 	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
 
 	/*
 	 * Read the page table to find out what is already mapped.
 	 * This assumes we have mapped a block of memory from KERNBASE
 	 * using a single L1 entry.
 	 */
 	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
 
 	/* Sanity check the index, KERNBASE should be the first VA */
 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
 
 	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);
 
 	/* Create the l3 tables for the early devmap */
 	freemempos = pmap_bootstrap_l3(l1pt,
 	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
 
 	/*
 	 * Invalidate the mapping we created for the DTB. At this point a copy
 	 * has been created, and we no longer need it. We want to avoid the
 	 * possibility of an aliased mapping in the future.
 	 */
 	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
 	if ((pmap_load(l2p) & PTE_V) != 0)
 		pmap_clear(l2p);
 
 	sfence_vma();
 
 	/*
 	 * Carve "np" zeroed pages out of the memory following the kernel and
 	 * store their starting virtual address in "var".  Every use of "np"
 	 * is parenthesized and the body is a single statement, so the macro
 	 * is safe with expression arguments and in unbraced contexts.
 	 */
 #define alloc_pages(var, np) do {					\
 	(var) = freemempos;						\
 	freemempos += ((np) * PAGE_SIZE);				\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));			\
 } while (0)
 
 	/* Allocate dynamic per-cpu area. */
 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
 	msgbufp = (void *)msgbufpv;
 
 	virtual_avail = roundup2(freemempos, L2_SIZE);
 	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
 	kernel_vm_end = virtual_avail;
 
 	/* Everything from kernstart up to freemempos is now in use. */
 	pa = pmap_early_vtophys(l1pt, freemempos);
 
 	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: an empty pv list
  *	and the write-back memory attribute.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Second-stage pmap initialization, called by vm_init.  Sets up the
  *	pv chunk and allpmaps mutexes, the pool of pv list locks and the
  *	pv head table used for superpages, and advertises the superpage
  *	size when superpages are enabled.
  */
 void
 pmap_init(void)
 {
 	vm_size_t tblsz;
 	int idx, npg;
 
 	/* Mutexes for the pv chunk list and the list of all pmaps. */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
 
 	/* The pool of pv list locks. */
 	for (idx = 0; idx < NPV_LIST_LOCKS; idx++)
 		rw_init(&pv_list_locks[idx], "pmap pv list");
 
 	/* One pv head per L2_SIZE unit, up to the end of the last segment. */
 	npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
 
 	/* Allocate and initialize the (page-rounded, zeroed) pv head table. */
 	tblsz = (vm_size_t)(npg * sizeof(struct md_page));
 	tblsz = round_page(tblsz);
 	pv_table = (struct md_page *)kmem_malloc(tblsz, M_WAITOK | M_ZERO);
 	for (idx = 0; idx < npg; idx++)
 		TAILQ_INIT(&pv_table[idx].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	if (superpages_enabled)
 		pagesizes[1] = L2_SIZE;
 }
 
 #ifdef SMP
 /*
  * For SMP, these functions have to use IPIs for coherence.
  *
  * In general, the calling thread uses a plain fence to order the
  * writes to the page tables before invoking an SBI callback to invoke
  * sfence_vma() on remote CPUs.
  */
 /*
  * Invalidate the TLB entry for "va" on the local hart and, once SMP has
  * started, on every other hart recorded in the pmap's pm_active set.
  */
 static void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t mask;
 
 	/*
 	 * Pin the thread; presumably so the hart removed from the mask below
 	 * is the same one that executes the local sfence.
 	 */
 	sched_pin();
 	mask = pmap->pm_active;
 	CPU_CLR(PCPU_GET(hart), &mask);
 	/* Order earlier page-table writes before the remote request. */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, va, 1);
 	sfence_vma_page(va);
 	sched_unpin();
 }
 
 /*
  * Invalidate TLB entries for the range starting at "sva" and ending at
  * "eva".  Remote harts in the pmap's pm_active set receive a ranged SBI
  * request once SMP has started; the local hart performs a full
  * sfence_vma().
  */
 static void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t mask;
 
 	sched_pin();
 	mask = pmap->pm_active;
 	CPU_CLR(PCPU_GET(hart), &mask);
 	/* Order earlier page-table writes before the remote request. */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
 
 	/*
 	 * Might consider a loop of sfence_vma_page() for a small
 	 * number of pages in the future.
 	 */
 	sfence_vma();
 	sched_unpin();
 }
 
 /*
  * Invalidate all TLB entries on the local hart and, once SMP has started,
  * request the same on every other hart in the pmap's pm_active set.
  */
 static void
 pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t mask;
 
 	sched_pin();
 	mask = pmap->pm_active;
 	CPU_CLR(PCPU_GET(hart), &mask);
 
 	/*
 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
 	 * address to perform a global fence.  BBL currently treats
 	 * all sfence_vma requests as global however.
 	 */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
 	sfence_vma();
 	sched_unpin();
 }
 #else
 /*
  * Normal, non-SMP, invalidation functions.
  * We inline these within pmap.c for speed.
  */
 /*
  * Non-SMP variant: invalidate the local TLB entry for "va".  The pmap
  * argument is not used.
  */
 static __inline void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	sfence_vma_page(va);
 }
 
 /*
  * Non-SMP variant: invalidate a range by flushing the whole local TLB.
  * None of the arguments are used.
  */
 static __inline void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	/*
 	 * Might consider a loop of sfence_vma_page() for a small
 	 * number of pages in the future.
 	 */
 	sfence_vma();
 }
 
 /*
  * Non-SMP variant: flush the whole local TLB.  The pmap argument is not
  * used.
  */
 static __inline void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	sfence_vma();
 }
 #endif
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  *
  *	Returns 0 when there is no L2 entry for the address or when the
  *	L2 entry is not valid.  The pmap lock is taken for the duration
  *	of the lookup.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l2p, l2;
 	pt_entry_t *l3p, l3;
 	vm_paddr_t pa;
 
 	pa = 0;
 	PMAP_LOCK(pmap);
 	/*
 	 * Start with the L2 table.  The entry must be valid before it is
 	 * interpreted: an invalid (e.g. zero) entry would otherwise be
 	 * treated as a pointer to an L3 table at physical address 0.
 	 */
 	l2p = pmap_l2(pmap, va);
 	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
 		if ((l2 & PTE_RX) == 0) {
 			/* L2 points to an L3 table; pmap_l2_to_l3 never fails. */
 			l3p = pmap_l2_to_l3(l2p, va);
 			l3 = pmap_load(l3p);
 			pa = PTE_TO_PHYS(l3);
 			pa |= (va & L3_OFFSET);
 		} else {
 			/* L2 is a superpage mapping. */
 			pa = L2PTE_TO_PHYS(l2);
 			pa |= (va & L2_OFFSET);
 		}
 	}
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Look up the page mapped at "va" and, if the mapping
  *		permits the access requested in "prot", wire it via
  *		vm_page_wire_mapped().  Returns the page on success
  *		and NULL otherwise.  The lookup and the wiring are
  *		both performed with the pmap lock held.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pt_entry_t *ptep, pte;
 	vm_paddr_t page_pa;
 	vm_page_t held;
 
 	held = NULL;
 	PMAP_LOCK(pmap);
 	ptep = pmap_l3(pmap, va);
 	if (ptep != NULL) {
 		pte = pmap_load(ptep);
 		/* A write request needs a writable mapping. */
 		if (pte != 0 &&
 		    ((pte & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0)) {
 			page_pa = PTE_TO_PHYS(pte);
 			held = PHYS_TO_VM_PAGE(page_pa);
 			if (!vm_page_wire_mapped(held))
 				held = NULL;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 	return (held);
 }
 
 /*
  * Translate a kernel virtual address to a physical address without taking
  * the kernel pmap lock.  Direct-map addresses are converted arithmetically;
  * all other addresses are resolved by walking the kernel page tables.
  * Panics if there is no L2 entry for the address.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pd_entry_t *l2, l2e;
 	pt_entry_t *l3;
 	vm_paddr_t pa;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 		pa = DMAP_TO_PHYS(va);
 	} else {
 		l2 = pmap_l2(kernel_pmap, va);
 		if (l2 == NULL)
 			panic("pmap_kextract: No l2");
 		l2e = pmap_load(l2);
 		/*
 		 * Beware of concurrent promotion and demotion! We must
 		 * use l2e rather than loading from l2 multiple times to
 		 * ensure we see a consistent state, including the
 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
 		 * to use an old l2e because the L3 page is preserved by
 		 * promotion.
 		 */
 		if ((l2e & PTE_RX) != 0) {
 			/* superpages */
 			pa = L2PTE_TO_PHYS(l2e);
 			pa |= (va & L2_OFFSET);
 			return (pa);
 		}
 
 		/* Pass the snapshot, not l2, so the helper reloads the copy. */
 		l3 = pmap_l2_to_l3(&l2e, va);
 		if (l3 == NULL)
 			panic("pmap_kextract: No l3...");
 		pa = PTE_TO_PHYS(pmap_load(l3));
 		pa |= (va & PAGE_MASK);
 	}
 	return (pa);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Map the physical range [pa, pa + size) at kernel virtual address "sva"
  * using page-sized L3 entries with PTE_KERN permissions, then invalidate
  * the TLB for the range.  "sva" and "pa" must be page aligned and "size"
  * a multiple of the page size; the L3 tables must already exist.  The
  * "mode" argument is accepted but not used.
  */
 void
 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
 {
 	pt_entry_t entry;
 	pt_entry_t *l3;
 	vm_offset_t va;
 	pn_t pn;
 
 	KASSERT((pa & L3_OFFSET) == 0,
 	    ("pmap_kenter: Invalid physical address"));
 	KASSERT((sva & L3_OFFSET) == 0,
 	    ("pmap_kenter: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kenter: Mapping is not page-sized"));
 
 	va = sva;
 	while (size != 0) {
 		l3 = pmap_l3(kernel_pmap, va);
 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
 
 		/* Build a kernel leaf entry for this physical page. */
 		pn = (pa / PAGE_SIZE);
 		entry = PTE_KERN;
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l3, entry);
 
 		va += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		size -= PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Map a device memory range into the kernel address space.  This is
  * pmap_kenter() called with the VM_MEMATTR_DEVICE memory attribute.
  */
 void
 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
 {
 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
 }
 
 /*
  * Clear the kernel L3 entry that maps "va" and flush the local TLB.
  * Note: not SMP coherent; only the local hart's TLB is flushed.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *ptep;
 
 	ptep = pmap_l3(kernel_pmap, va);
 	KASSERT(ptep != NULL, ("pmap_kremove: Invalid address"));
 	pmap_clear(ptep);
 
 	sfence_vma();
 }
 
 /*
  * Remove the kernel mappings for "size" bytes starting at "sva", one L3
  * entry per page, then invalidate the TLB for the range.  "sva" must be
  * page aligned and "size" a multiple of the page size.
  */
 void
 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
 {
 	pt_entry_t *ptep;
 	vm_offset_t va;
 
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kremove_device: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kremove_device: Mapping is not page-sized"));
 
 	for (va = sva; size != 0; va += PAGE_SIZE, size -= PAGE_SIZE) {
 		ptep = pmap_l3(kernel_pmap, va);
 		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
 		pmap_clear(ptep);
 	}
 
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  *	Map a range of physical addresses into kernel virtual address
  *	space.
  *
  *	The generic contract lets an architecture with a direct map return
  *	an address inside that region and leave '*virt' untouched, which is
  *	what happens here: the direct-map address of 'start' is returned
  *	and 'virt', 'end' and 'prot' are not used.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	return (PHYS_TO_DMAP(start));
 }
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  *
  * "sva" is the first kernel virtual address, "ma" the array of pages and
  * "count" the number of pages to map.  The L3 tables covering the range
  * must already exist.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *l3;
 	pt_entry_t entry;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_page_t m;
 	pn_t pn;
 	int i;
 
 	va = sva;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 		pa = VM_PAGE_TO_PHYS(m);
 		pn = (pa / PAGE_SIZE);
 		l3 = pmap_l3(kernel_pmap, va);
 		/* Catch a missing L3 table instead of storing through NULL. */
 		KASSERT(l3 != NULL, ("pmap_qenter: Invalid address"));
 
 		entry = PTE_KERN;
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l3, entry);
 
 		va += L3_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * This routine tears out page mappings from the
  * kernel -- it is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  *
  * "sva" is the first kernel virtual address and "count" the number of
  * pages to unmap.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	pt_entry_t *l3;
 	vm_offset_t va;
 
 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
 
 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
 		l3 = pmap_l3(kernel_pmap, va);
 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
 		pmap_clear(l3);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Report whether superpages are enabled.  The answer comes from the
  * global superpages_enabled setting; the pmap argument is not used.
  */
 bool
 pmap_ps_enabled(pmap_t pmap __unused)
 {
 
 	return (superpages_enabled);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue an unused page table page on "free", the list of pages that will
  * be handed back to the physical memory manager once the TLB has been
  * updated.  PG_ZERO is set on the page when "set_PG_ZERO" is true and
  * cleared otherwise.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 	m->flags = set_PG_ZERO ? (m->flags | PG_ZERO) : (m->flags & ~PG_ZERO);
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Add the page table page "ml3" to the pmap's collection of idle page
  * table pages (the pm_root radix tree).  Each page table page covers a
  * distinct range of virtual addresses and the collection is ordered by
  * that range.
  *
  * The page's valid bits are set to all-valid when "promoted" is true and
  * cleared otherwise; in the latter case "ml3" must be zero filled.
  * Returns the result of vm_radix_insert().
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
 {
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if (promoted)
 		ml3->valid = VM_PAGE_BITS_ALL;
 	else
 		ml3->valid = 0;
 
 	return (vm_radix_insert(&pmap->pm_root, ml3));
 }
 
 /*
  * Take the page table page that maps "va" out of the pmap's collection of
  * idle page table pages and return it.  Returns NULL when the collection
  * holds no page table page for that address.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 	vm_page_t ptpage;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ptpage = vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va));
 	return (ptpage);
 }
 
 /*
  * Drop one reference on a page table page.  The reference count tracks
  * the number of valid page table entries within the page.  When it
  * reaches zero the page is unmapped via _pmap_unwire_ptp() and TRUE is
  * returned; otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	if (--m->ref_count != 0)
 		return (FALSE);
 
 	_pmap_unwire_ptp(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page "m", whose reference count has dropped to
  * zero, and queue it on "free" for release.  A page with pindex >= NUL2E
  * is referenced from an L1 entry; any other page is referenced from an L2
  * entry.  In the latter case one reference is also dropped on the page
  * holding that L2 entry, which may in turn be unmapped.
  * The pmap lock must be held.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	vm_paddr_t phys;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (m->pindex >= NUL2E) {
 		pd_entry_t *l1;
 		/* Clear the L1 entry and propagate the change. */
 		l1 = pmap_l1(pmap, va);
 		pmap_clear(l1);
 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
 	} else {
 		pd_entry_t *l2;
 		/* Clear the L2 entry that pointed at this page. */
 		l2 = pmap_l2(pmap, va);
 		pmap_clear(l2);
 	}
 	pmap_resident_count_dec(pmap, 1);
 	if (m->pindex < NUL2E) {
 		pd_entry_t *l1;
 		vm_page_t pdpg;
 
 		/* Drop the reference this page held on its parent L2 page. */
 		l1 = pmap_l1(pmap, va);
 		phys = PTE_TO_PHYS(pmap_load(l1));
 		pdpg = PHYS_TO_VM_PAGE(phys);
 		pmap_unwire_ptp(pmap, va, pdpg, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
 	vm_wire_sub(1);
 
 	/*
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed: drop one reference on
  * the page table page that "ptepde" points to, which frees that page when
  * the count reaches zero.  Addresses at or above VM_MAXUSER_ADDRESS are
  * ignored and yield 0; otherwise the result of pmap_unwire_ptp() is
  * returned.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 	vm_paddr_t ptp_pa;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 	ptp_pa = PTE_TO_PHYS(ptepde);
 	return (pmap_unwire_ptp(pmap, va, PHYS_TO_VM_PAGE(ptp_pa), free));
 }
 
 /*
  * Initialize "pmap" so that it shares the kernel pmap's L1 table, with
  * zeroed statistics and an empty active-CPU set, then activate it via
  * pmap_activate_boot().
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 	PMAP_LOCK_INIT(pmap);
 
 	CPU_ZERO(&pmap->pm_active);
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	/* Reuse the kernel's L1 table and derive the satp value from it. */
 	pmap->pm_l1 = kernel_pmap->pm_l1;
 	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
 
 	pmap_activate_boot(pmap);
 }
 
 /*
  * Initialize a pmap: allocate a wired, zeroed L1 page, derive the satp
  * value from it, copy in the kernel's L1 entries, and put the pmap on the
  * list of all pmaps.  The allocation uses VM_ALLOC_WAITOK.  Always
  * returns 1.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t root_pg;
 	vm_paddr_t root_pa;
 
 	/* Allocate the L1 (root) page table page. */
 	root_pg = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
 	    VM_ALLOC_WAITOK);
 	root_pa = VM_PAGE_TO_PHYS(root_pg);
 
 	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(root_pa);
 	pmap->pm_satp = SATP_MODE_SV39 | (root_pa >> PAGE_SHIFT);
 
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 	CPU_ZERO(&pmap->pm_active);
 
 	/* Start from a copy of the kernel's L1 table. */
 	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);
 
 	/* Make the pmap visible to pmap_distribute_l1(). */
 	mtx_lock(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
 	vm_radix_init(&pmap->pm_root);
 
 	return (1);
 }
 
 /*
  * This routine is called if the desired page table page does not exist.
  *
  * "ptepindex" selects the page to allocate: values >= NUL2E name an L2
  * table page (installed in L1 slot ptepindex - NUL2E), smaller values name
  * an L3 table page (installed in an L2 slot).  Returns the new page, or
  * NULL if the allocation failed and the caller should retry.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  */
 static vm_page_t
 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, /*pdppg, */pdpg;
 	pt_entry_t entry;
 	vm_paddr_t phys;
 	pn_t pn;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 	if (m == NULL) {
 		if (lockp != NULL) {
 			/* Drop every lock before sleeping, then retake them. */
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			rw_runlock(&pvh_global_lock);
 			vm_wait(NULL);
 			rw_rlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	m->pindex = ptepindex;
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= NUL2E) {
 		pd_entry_t *l1;
 		vm_pindex_t l1index;
 
 		/* New L2 table page: hook it into the L1 table. */
 		l1index = ptepindex - NUL2E;
 		l1 = &pmap->pm_l1[l1index];
 		KASSERT((pmap_load(l1) & PTE_V) == 0,
 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
 
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l1, entry);
 		pmap_distribute_l1(pmap, l1index, entry);
 	} else {
 		vm_pindex_t l1index;
 		pd_entry_t *l1, *l2;
 
 		/* New L3 table page: make sure its parent L2 page exists. */
 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
 		l1 = &pmap->pm_l1[l1index];
 		if (pmap_load(l1) == 0) {
 			/* recurse for allocating page dir */
 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
 			    lockp) == NULL) {
 				/* Give back the page allocated above. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/* Parent exists: it gains one more valid entry. */
 			phys = PTE_TO_PHYS(pmap_load(l1));
 			pdpg = PHYS_TO_VM_PAGE(phys);
 			pdpg->ref_count++;
 		}
 
 		phys = PTE_TO_PHYS(pmap_load(l1));
 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
 		KASSERT((pmap_load(l2) & PTE_V) == 0,
 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
 
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l2, entry);
 	}
 
 	pmap_resident_count_inc(pmap, 1);
 
 	return (m);
 }
 
 /*
  * Return the L2 page table page covering "va" with one additional
  * reference, allocating it when the L1 entry is not valid.  When "lockp"
  * is non-NULL a failed allocation is retried; when it is NULL, NULL may be
  * returned.
  */
 static vm_page_t
 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *l1;
 	vm_page_t l2pg;
 	vm_pindex_t l2pindex;
 
 	for (;;) {
 		l1 = pmap_l1(pmap, va);
 		if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
 			KASSERT((pmap_load(l1) & PTE_RWX) == 0,
 			    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
 			    pmap_load(l1), va));
 			/* The L2 page exists: take another reference on it. */
 			l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
 			l2pg->ref_count++;
 			break;
 		}
 
 		/* No L2 page yet: allocate one. */
 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
 		if (l2pg != NULL || lockp == NULL)
 			break;
 	}
 	return (l2pg);
 }
 
 /*
  * Return the L3 page table page covering "va" with one additional
  * reference, allocating it when the L2 entry is absent or zero.  When
  * "lockp" is non-NULL a failed allocation is retried; when it is NULL,
  * NULL may be returned.
  */
 static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t pindex;
 	pd_entry_t *l2ep;
 	vm_paddr_t ptp_pa;
 	vm_page_t ptp;
 
 	/* Index of the page table page that maps "va". */
 	pindex = pmap_l2_pindex(va);
 
 	for (;;) {
 		l2ep = pmap_l2(pmap, va);
 		if (l2ep != NULL && pmap_load(l2ep) != 0) {
 			/* Already mapped: just take another reference. */
 			ptp_pa = PTE_TO_PHYS(pmap_load(l2ep));
 			ptp = PHYS_TO_VM_PAGE(ptp_pa);
 			ptp->ref_count++;
 			break;
 		}
 
 		/* Not mapped (or deallocated meanwhile): allocate it. */
 		ptp = _pmap_alloc_l3(pmap, pindex, lockp);
 		if (ptp != NULL || lockp == NULL)
 			break;
 	}
 	return (ptp);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Tear down a pmap that was set up by pmap_pinit: unlink it from the list
  * of all pmaps and free its L1 page.  The pmap must have no resident
  * pages and must not be active on any CPU.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t l1pg;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	mtx_lock(&allpmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
 	/* pm_l1 is a direct-map address; recover its page and free it. */
 	l1pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
 	vm_page_unwire_noq(l1pg);
 	vm_page_free(l1pg);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the span between
  * VM_MIN_KERNEL_ADDRESS and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long total;
 
 	total = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 	return (sysctl_handle_long(oidp, &total, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, kvm_size, "LU",
     "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the span between kernel_vm_end
  * and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long remaining;
 
 	remaining = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &remaining, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, kvm_free, "LU",
     "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Advances kernel_vm_end in L2_SIZE steps until it reaches "addr" (rounded
  * up to L2_SIZE and clamped to the end of kernel_map), installing missing
  * L2 and L3 page table pages along the way.  Panics if a page table page
  * cannot be allocated.  The kernel map's system mutex must be held.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *l1, *l2;
 	pt_entry_t entry;
 	pn_t pn;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	addr = roundup2(addr, L2_SIZE);
 	/* "addr - 1" keeps the comparison correct if addr wrapped to 0. */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
 		if (pmap_load(l1) == 0) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 
 			pn = (paddr / PAGE_SIZE);
 			entry = (PTE_V);
 			entry |= (pn << PTE_PPN0_S);
 			pmap_store(l1, entry);
 			/* Propagate the new kernel L1 entry to the other pmaps. */
 			pmap_distribute_l1(kernel_pmap,
 			    pmap_l1_index(kernel_vm_end), entry);
 			continue; /* try again */
 		}
 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
 		/* A valid non-leaf L2 entry means an L3 page already exists. */
 		if ((pmap_load(l2) & PTE_V) != 0 &&
 		    (pmap_load(l2) & PTE_RWX) == 0) {
 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		/* Allocate and install a new L3 page for this L2 slot. */
 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 
 		pn = (paddr / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l2, entry);
 
 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
 
 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 }
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Layout assumptions the pv chunk code relies on: a chunk is exactly one
  * page (pv_to_chunk() masks with PAGE_MASK), the free bitmap has three
  * 64-bit words (pc_map[0..2]), and a chunk holds 168 entries, which is
  * the number of bits set across PC_FREE0, PC_FREE1 and PC_FREE2
  * (64 + 64 + 40).
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv chunk that contains "pv", found by rounding the entry's
  * address down to a page boundary.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t base;
 
 	base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)base);
 }
 
 /* The pmap that owns the chunk containing the given pv entry. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Bitmap values of a completely free chunk, one per pc_map[] word.  A set
  * bit marks a free pv entry; the last word only has its low 40 bits set.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The same all-free masks in array form, indexed like pc_map[]. */
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 /*
  * PV statistics counters and their sysctl nodes.  The whole block is
  * compiled out by the surrounding "#if 0".
  * NOTE(review): PV_STAT() call sites in this file name these counters;
  * presumably PV_STAT() expands to nothing at present -- confirm before
  * enabling this block.
  */
 #if 0
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 #endif /* 0 */
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Intended contract: returns NULL if PV entries were reclaimed from the
  * specified pmap.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  *
  * NOTE: not implemented yet.  The function unconditionally panics, so
  * callers that reach it (a failed pv chunk page allocation) do not return.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 
 	panic("RISCVTODO: reclaim_pv_chunk");
 }
 
 /*
  * Return a pv entry to its chunk's free bitmap.  A chunk that becomes
  * entirely free is unlinked from the pmap and released; otherwise the
  * chunk is moved to the head of the pmap's chunk list.  The pv global
  * lock and the pmap lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *pc;
 	int idx;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Mark the entry's slot free in the owning chunk's bitmap. */
 	pc = pv_to_chunk(pv);
 	idx = pv - &pc->pc_pventry[0];
 	pc->pc_map[idx / 64] |= 1ul << (idx % 64);
 
 	if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 	    pc->pc_map[2] == PC_FREE2) {
 		/* Every entry is free: give the whole chunk back. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		free_pv_chunk(pc);
 		return;
 	}
 
 	/* 98% of the time, pc is already at the head of the list. */
 	if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	}
 }
 
 /*
  * Release an entirely free pv chunk: unlink it from the global pv_chunks
  * list, drop its page from the dump set, and free the page.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t chunk_pg;
 
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 
 	/* The chunk lives in the direct map; recover its page and free it. */
 	chunk_pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(chunk_pg->phys_addr);
 	vm_page_unwire_noq(chunk_pg);
 	vm_page_free(chunk_pg);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * The given PV list lock may be released.
  *
  * The pv global lock and the pmap lock must be held by the caller.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	/* Only the first chunk is examined; full chunks are kept at the tail. */
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = ffsl(pc->pc_map[field]) - 1;
 				break;
 			}
 		}
 		/* "bit" is only meaningful when the loop exited via break. */
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* The chunk is accessed through the direct map. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 is the one handed out; its bit was cleared above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".  Spare entries are first counted in the
  * chunks already on the pmap's chunk list; any shortfall is made up by
  * allocating new chunk pages, falling back to reclaim_pv_chunk() when the
  * page allocation fails.
  *
  * "lockp" must not be NULL.  The given PV list lock may be released.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	/*
 	 * Count the free entries at the front of the pmap's chunk list.  The
 	 * scan ends at the first chunk with no free entries (get_pv_entry()
 	 * moves exhausted chunks to the tail) or once enough were found.
 	 */
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		bit_count((bitstr_t *)pc->pc_map, 0,
 		    sizeof(pc->pc_map) * NBBY, &free);
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Each chunk added below is credited with _NPCPV entries. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		/* XXX PV STATS */
 		/*
 		 * NOTE(review): get_pv_entry() calls dump_add_page() for the
 		 * chunk pages it allocates, but the call is compiled out
 		 * here -- confirm whether that difference is intentional.
 		 */
 #if 0
 		dump_add_page(m->phys_addr);
 #endif
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global pv_chunks list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * Searches the given pv list for the entry that belongs to "pmap" and maps
  * "va".  A matching entry is unlinked from the list, the list's generation
  * count is bumped, and the entry is returned.  NULL is returned when no
  * entry matches.  Usable on the pv lists of both 4KB and 2MB page mappings.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t cur;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	TAILQ_FOREACH(cur, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(cur) != pmap || cur->pv_va != va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, cur, pv_next);
 		pvh->pv_gen++;
 		return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Unlinks the pv entry for the given pmap and virtual address from the
  * given pv list and hands it to free_pv_entry().  The entry is expected to
  * exist.  Usable on the pv lists of both 4KB and 2MB page mappings.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t victim;
 
 	victim = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(victim != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
 
 	free_pv_entry(pmap, victim);
 }
 
 /*
  * Creates the PV entry for a 4KB page mapping, but only if the memory for
  * it can be obtained without reclamation.  Returns TRUE when the entry was
  * added to the page's pv list and FALSE when no entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t entry;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	entry = get_pv_entry(pmap, NULL);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * "va" may be any address within the 2MB mapping; it is truncated to the
  * 2MB boundary.  "pa" is the physical address of the mapping.  The spare pv
  * entries consumed here are taken directly from the pmap's chunk list, so
  * the caller is expected to have reserved them beforehand (the KASSERT in
  * the loop fires if a chunk without spares is encountered).
  */
 static void __unused
 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_page_t m;
 	vm_offset_t va_last;
 	int bit, field;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va &= ~L2_OFFSET;
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining 511 pv entries. */
 	va_last = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		/* Always draw from the chunk at the head of the list. */
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field] != 0) {
 				bit = ffsl(pc->pc_map[field]) - 1;
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/*
 				 * Advance first: the entry for the first 4KB
 				 * page was transferred above.
 				 */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_l2: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* This chunk is exhausted; rotate it to the tail. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* The last chunk used may also have been emptied by the final entry. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	/* XXX PV stats */
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * PV list bookkeeping for a promotion to a 2MB page mapping: the pv entry
  * of the first 4KB page is moved onto the 2MB page's pv list, and the pv
  * entries of the remaining 4KB pages are freed.  "va" must be 2MB-aligned.
  */
 static void
 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *superpage_pvh;
 	pv_entry_t head_pv;
 	vm_page_t page;
 	vm_offset_t last_va;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	KASSERT((va & L2_OFFSET) == 0,
 	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
 
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/* Move the first page's entry over to the 2MB page's list. */
 	page = PHYS_TO_VM_PAGE(pa);
 	head_pv = pmap_pvh_remove(&page->md, pmap, va);
 	KASSERT(head_pv != NULL,
 	    ("pmap_pv_promote_l2: pv for %#lx not found", va));
 	superpage_pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&superpage_pvh->pv_list, head_pv, pv_next);
 	superpage_pvh->pv_gen++;
 
 	/* Release the entries of every following 4KB page. */
 	last_va = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		page++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&page->md, pmap, va);
 		if (va >= last_va)
 			break;
 	}
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Creates the PV entry for a 2MB page mapping and appends it to the pv list
  * of the 2MB page named by "l2e".  With PMAP_ENTER_NORECLAIM set in "flags",
  * false is returned if the entry cannot be allocated without reclamation;
  * otherwise the function always returns true.
  */
 static bool
 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *pvh;
 	pv_entry_t entry;
 	vm_paddr_t pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	reclaim_lockp = lockp;
 	if ((flags & PMAP_ENTER_NORECLAIM) != 0)
 		reclaim_lockp = NULL;
 	entry = get_pv_entry(pmap, reclaim_lockp);
 	if (entry == NULL)
 		return (false);
 
 	entry->pv_va = va;
 	pa = PTE_TO_PHYS(l2e);
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, entry, pv_next);
 	pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Finishes the removal of a 2MB mapping from the kernel pmap.  The L2 entry
  * is expected to have been cleared already; it is re-pointed at the page
  * table page that was saved for this 2MB range, so the kernel's L2 slot
  * keeps referring to an L3 table rather than staying empty.
  *
  * "va" must not lie in the direct map, and "pmap" must be kernel_pmap.
  * Panics if no page table page is recorded for "va".
  */
 static void
 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	pt_entry_t newl2, oldl2;
 	vm_page_t ml3;
 	vm_paddr_t ml3pa;
 
 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ml3 = pmap_remove_pt_page(pmap, va);
 	if (ml3 == NULL)
 		panic("pmap_remove_kernel_l2: Missing pt page");
 
 	/*
 	 * A PTE holds the page number shifted to PTE_PPN0_S, not the raw
 	 * physical address, so convert it the same way the other page table
 	 * pointers in this file are built.
 	 */
 	ml3pa = VM_PAGE_TO_PHYS(ml3);
 	newl2 = ((ml3pa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
 
 	/*
 	 * If this page table page was unmapped by a promotion, then it
 	 * contains valid mappings.  Zero it to invalidate those mappings.
 	 */
 	if (ml3->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
 
 	/*
 	 * Demote the mapping.
 	 */
 	oldl2 = pmap_load_store(l2, newl2);
 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
 	    __func__, l2, oldl2));
 }
 
 /*
  * pmap_remove_l2: tear down one 2MB superpage mapping.
  *
  * Clears the L2 entry, invalidates the whole 2MB range, adjusts the wired
  * and resident counts, and for a managed mapping frees its pv entry and
  * propagates the dirty/accessed state to each constituent page.  The page
  * table page saved for the range, if any, is then disposed of.  Returns
  * the result of pmap_unuse_pt().
  */
 static int
 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t oldl2;
 	vm_paddr_t pa;
 	vm_offset_t eva, va;
 	vm_page_t m, ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
 	oldl2 = pmap_load_clear(l2);
 	KASSERT((oldl2 & PTE_RWX) != 0,
 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
 
 	/*
 	 * According to the sfence.vma documentation a single address inside
 	 * a superpage mapping would be enough.  Promotion does not perform
 	 * any invalidation, though, so 4KB mappings within the superpage may
 	 * still be cached by TLBs; hence the entire range is invalidated.
 	 */
 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
 
 	if ((oldl2 & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
 
 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
 		pa = PTE_TO_PHYS(oldl2);
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 		pvh = pa_to_pvh(pa);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + L2_SIZE;
 		m = PHYS_TO_VM_PAGE(pa);
 		for (va = sva; va < eva; va += PAGE_SIZE) {
 			if ((oldl2 & PTE_D) != 0)
 				vm_page_dirty(m);
 			if ((oldl2 & PTE_A) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/* No mappings of either size remain for this page. */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 			m++;
 		}
 	}
 
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_l2(pmap, l2, sva);
 	} else if ((ml3 = pmap_remove_pt_page(pmap, sva)) != NULL) {
 		KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_remove_l2: l3 page not promoted"));
 		pmap_resident_count_dec(pmap, 1);
 		KASSERT(ml3->ref_count == Ln_ENTRIES,
 		    ("pmap_remove_l2: l3 page ref count error"));
 		ml3->ref_count = 1;
 		vm_page_unwire_noq(ml3);
 		pmap_add_delayed_free_list(ml3, free, FALSE);
 	}
 	return (pmap_unuse_pt(pmap, sva, l1e, free));
 }
 
 /*
  * pmap_remove_l3: unmap a single 4KB page.
  *
  * Clears the L3 entry, invalidates the address, and adjusts the wired and
  * resident counts.  For a managed mapping the dirty/accessed state is passed
  * on to the page and its pv entry is freed.  Returns the result of
  * pmap_unuse_pt().
  */
 static int
 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *superpage_pvh;
 	pt_entry_t prev;
 	vm_paddr_t pa;
 	vm_page_t page;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	prev = pmap_load_clear(l3);
 	pmap_invalidate_page(pmap, va);
 	if ((prev & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count -= 1;
 	pmap_resident_count_dec(pmap, 1);
 
 	/* Unmanaged mappings have no pv entry or page state to update. */
 	if ((prev & PTE_SW_MANAGED) == 0)
 		return (pmap_unuse_pt(pmap, va, l2e, free));
 
 	pa = PTE_TO_PHYS(prev);
 	page = PHYS_TO_VM_PAGE(pa);
 	if ((prev & PTE_D) != 0)
 		vm_page_dirty(page);
 	if ((prev & PTE_A) != 0)
 		vm_page_aflag_set(page, PGA_REFERENCED);
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, page);
 	pmap_pvh_free(&page->md, pmap, va);
 	if (TAILQ_EMPTY(&page->md.pv_list) &&
 	    (page->flags & PG_FICTITIOUS) == 0) {
 		superpage_pvh = pa_to_pvh(VM_PAGE_TO_PHYS(page));
 		if (TAILQ_EMPTY(&superpage_pvh->pv_list))
 			vm_page_aflag_clear(page, PGA_WRITEABLE);
 	}
 
 	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	2MB mappings that lie entirely within the range are removed as a
  *	unit; a 2MB mapping that is only partially covered is demoted first
  *	and its 4KB mappings are then removed individually.  Page table
  *	pages released along the way are collected and freed only after the
  *	locks have been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct spglist free;
 	struct rwlock *lock;
 	vm_offset_t va, va_next;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	SLIST_INIT(&free);
 
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 		/* Nothing left mapped in this pmap; stop early. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			/* Skip the whole L1 region; guard against wraparound. */
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
 		if ((l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			/*
 			 * The L2 entry is a 2MB leaf.  Remove it whole when
 			 * sva is 2MB-aligned and the range covers all of it;
 			 * otherwise demote it and fall through to the 4KB
 			 * loop below.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				(void)pmap_remove_l2(pmap, l2, sva,
 				    pmap_load(l1), &free, &lock);
 				continue;
 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
 			    &lock)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 			l2e = pmap_load(l2);
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		/*
 		 * "va" marks the start of a run of removed pages that still
 		 * awaits a range invalidation; va == va_next means that no
 		 * run is pending.
 		 */
 		va = va_next;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			if (pmap_load(l3) == 0) {
 				/* A hole ends the pending run, if any. */
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
 					va = va_next;
 				}
 				continue;
 			}
 			if (va == va_next)
 				va = sva;
 			/*
 			 * A nonzero return ends the scan of this page table
 			 * page (presumably because it was freed -- confirm
 			 * against pmap_unuse_pt()).
 			 */
 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
 				sva += L3_SIZE;
 				break;
 			}
 		}
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page must be managed.  Any 2MB mappings that
  *		include the page are demoted first, after which every
  *		4KB mapping on the page's pv list is torn down.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct spglist free;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *l3, l3e;
 	pd_entry_t *l2, l2e;
 	pv_entry_t pv;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	/* Fictitious pages use the shared dummy pv head. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 
 	rw_wlock(&pvh_global_lock);
 	/*
 	 * Pass 1: demote every 2MB mapping of the page.  Each iteration
 	 * re-reads the list head, so the loop relies on pmap_demote_l2()
 	 * removing the entry from this list whether it succeeds or fails.
 	 */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		(void)pmap_demote_l2(pmap, l2, va);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: remove each remaining 4KB mapping of the page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap_resident_count_dec(pmap, 1);
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
 		l2e = pmap_load(l2);
 
 		KASSERT((l2e & PTE_RX) == 0,
 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		l3e = pmap_load_clear(l3);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		if (l3e & PTE_SW_WIRED)
 			pmap->pm_stats.wired_count--;
 		if ((l3e & PTE_A) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so the page can no longer be written. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	A request without read permission removes the range entirely.
  *	A request that grants both write and execute is a no-op, since
  *	this routine only ever takes permissions away.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e, mask;
 	vm_page_t m, mt;
 	vm_paddr_t pa;
 	vm_offset_t va_next;
 	bool anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return;
 
 	anychanged = false;
 	pv_lists_locked = false;
 	/* "mask" collects the PTE bits to clear from every mapping. */
 	mask = 0;
 	if ((prot & VM_PROT_WRITE) == 0)
 		mask |= PTE_W | PTE_D;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		mask |= PTE_X;
 resume:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			/* Skip the whole L1 region; guard against wraparound. */
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			/*
 			 * 2MB leaf: update it in place when the range covers
 			 * all of it, otherwise demote it and handle the 4KB
 			 * entries below.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 retryl2:
 				/*
 				 * Write permission is being removed from a
 				 * dirty managed mapping: record the dirty
 				 * state on every constituent page before
 				 * PTE_D is cleared.
 				 */
 				if ((prot & VM_PROT_WRITE) == 0 &&
 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
 				    (PTE_SW_MANAGED | PTE_D)) {
 					pa = PTE_TO_PHYS(l2e);
 					m = PHYS_TO_VM_PAGE(pa);
 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
 						vm_page_dirty(mt);
 				}
 				/* On failure l2e is refreshed; re-evaluate. */
 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
 					goto retryl2;
 				anychanged = true;
 				continue;
 			} else {
 				/*
 				 * Demotion needs pvh_global_lock.  If it
 				 * cannot be taken without blocking, drop the
 				 * pmap lock (flushing any changes made so
 				 * far), block on it, and restart from the
 				 * current sva.
 				 */
 				if (!pv_lists_locked) {
 					pv_lists_locked = true;
 					if (!rw_try_rlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all(
 							    pmap);
 						PMAP_UNLOCK(pmap);
 						rw_rlock(&pvh_global_lock);
 						goto resume;
 					}
 				}
 				if (!pmap_demote_l2(pmap, l2, sva)) {
 					/*
 					 * The large page mapping was destroyed.
 					 */
 					continue;
 				}
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			l3e = pmap_load(l3);
 retryl3:
 			if ((l3e & PTE_V) == 0)
 				continue;
 			/* Preserve the dirty state before PTE_D is cleared. */
 			if ((prot & VM_PROT_WRITE) == 0 &&
 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
 			    (PTE_SW_MANAGED | PTE_D)) {
 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e));
 				vm_page_dirty(m);
 			}
 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
 				goto retryl3;
 			anychanged = true;
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked)
 		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Attempts to resolve a fault on "va" by setting the accessed bit (and, for
  * a write fault, the dirty bit) in the existing mapping.  Returns 1 when a
  * valid mapping that permits the access was found and 0 otherwise.
  */
 int
 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
 {
 	pd_entry_t *l2, l2e;
 	pt_entry_t *ptep, pte, wanted;
 	int handled;
 
 	handled = 0;
 	PMAP_LOCK(pmap);
 
 	l2 = pmap_l2(pmap, va);
 	if (l2 == NULL)
 		goto out;
 	l2e = pmap_load(l2);
 	if ((l2e & PTE_V) == 0)
 		goto out;
 
 	if ((l2e & PTE_RWX) != 0) {
 		/* The L2 entry is itself the leaf mapping. */
 		ptep = l2;
 		pte = l2e;
 	} else {
 		ptep = pmap_l2_to_l3(l2, va);
 		if (ptep == NULL)
 			goto out;
 		pte = pmap_load(ptep);
 		if ((pte & PTE_V) == 0)
 			goto out;
 	}
 
 	/* The mapping must permit the kind of access that faulted. */
 	if (pmap != kernel_pmap && (pte & PTE_U) == 0)
 		goto out;
 	if (ftype == VM_PROT_WRITE && (pte & PTE_W) == 0)
 		goto out;
 	if (ftype == VM_PROT_EXECUTE && (pte & PTE_X) == 0)
 		goto out;
 	if (ftype == VM_PROT_READ && (pte & PTE_R) == 0)
 		goto out;
 
 	wanted = PTE_A;
 	if (ftype == VM_PROT_WRITE)
 		wanted |= PTE_D;
 
 	/*
 	 * The bits may already be set: spurious faults happen when the
 	 * implementation caches invalid entries in the TLB, or when
 	 * simultaneous accesses on multiple CPUs race with each other.
 	 */
 	if ((pte & wanted) != wanted)
 		pmap_store_bits(ptep, wanted);
 	sfence_vma();
 	handled = 1;
 out:
 	PMAP_UNLOCK(pmap);
 	return (handled);
 }
 
 /*
  * Convenience wrapper around pmap_demote_l2_locked() for callers that do
  * not already hold a PV list lock: any lock acquired during the demotion is
  * released before returning.  Returns the result of the locked variant.
  */
 static bool
 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
 {
 	struct rwlock *pvlock;
 	bool demoted;
 
 	pvlock = NULL;
 	demoted = pmap_demote_l2_locked(pmap, l2, va, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	return (demoted);
 }
 
 /*
  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
  * mapping is invalidated.
  *
  * "l2" must point at a leaf (superpage) L2 entry and "va" may be any
  * address within it.  Returns true when the L2 entry now points at a page
  * table page holding the equivalent 4KB mappings, and false when the
  * mapping was removed instead.
  */
 static bool
 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	struct spglist free;
 	vm_page_t mpte;
 	pd_entry_t newl2, oldl2;
 	pt_entry_t *firstl3, newl3;
 	vm_paddr_t mptepa;
 	int i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	oldl2 = pmap_load(l2);
 	KASSERT((oldl2 & PTE_RWX) != 0,
 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
 	/*
 	 * Obtain a page table page: reuse the one saved for this range if
 	 * there is one, otherwise allocate a new one.  A mapping that was
 	 * never accessed (PTE_A clear), or one for which no page can be
 	 * allocated, is removed outright instead of being demoted.
 	 */
 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 	    NULL) {
 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
 		    VM_ALLOC_WIRED)) == NULL) {
 			SLIST_INIT(&free);
 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 			vm_page_free_pages_toq(&free, true);
 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
 			    "failure for va %#lx in pmap %p", va, pmap);
 			return (false);
 		}
 		mpte->pindex = pmap_l2_pindex(va);
 		/*
 		 * For user addresses the new page table page starts out
 		 * fully referenced and is counted as resident.
 		 */
 		if (va < VM_MAXUSER_ADDRESS) {
 			mpte->ref_count = Ln_ENTRIES;
 			pmap_resident_count_inc(pmap, 1);
 		}
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	/* Non-leaf entry pointing at the page table page. */
 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
 	KASSERT((oldl2 & PTE_A) != 0,
 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
 	/* Template for the 4KB entries: same bits as the 2MB entry. */
 	newl3 = oldl2;
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * initialize it.
 	 */
 	if (mpte->valid == 0) {
 		/* Entry i maps the i-th 4KB page of the 2MB region. */
 		for (i = 0; i < Ln_ENTRIES; i++)
 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
 	}
 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
 	    "addresses"));
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
 		for (i = 0; i < Ln_ENTRIES; i++)
 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
 	 * state of the L2 entry and the PV lists will be inconsistent, which
 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
 	 * expected PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldl2 & PTE_SW_MANAGED) != 0)
 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
 
 	/*
 	 * Demote the mapping.
 	 */
 	pmap_store(l2, newl2);
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldl2 & PTE_SW_MANAGED) != 0)
 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
 
 	atomic_add_long(&pmap_l2_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
 	    va, pmap);
 	return (true);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Tries to replace the 512 4KB mappings under the given L2 entry with a
  * single 2MB mapping.  The promotion is abandoned (and counted in
  * pmap_l2_p_failures) when the first page is not 2MB-aligned physically,
  * when the pages are not physically contiguous, when their PTE_PROMOTE
  * attributes differ, or when the page table page cannot be saved with
  * pmap_insert_pt_page().  On success the L2 entry is overwritten with the
  * first L3 entry, which then serves as the superpage leaf.
  */
 static void
 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *firstl3, firstl3e, *l3, l3e;
 	vm_paddr_t pa;
 	vm_page_t ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	va &= ~L2_OFFSET;
 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
 
 	/* The first L3 entry must map a 2MB-aligned physical address. */
 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
 	firstl3e = pmap_load(firstl3);
 	pa = PTE_TO_PHYS(firstl3e);
 	if ((pa & L2_OFFSET) != 0) {
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
 		    va, pmap);
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		return;
 	}
 
 	/*
 	 * Downgrade a clean, writable mapping to read-only to ensure that the
 	 * hardware does not set PTE_D while we are comparing PTEs.
 	 *
 	 * Upon a write access to a clean mapping, the implementation will
 	 * either atomically check protections and set PTE_D, or raise a page
 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
 	 * to do so lazily.
 	 */
 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
 			firstl3e &= ~PTE_W;
 			break;
 		}
 	}
 
 	/*
 	 * Check every remaining L3 entry: it must map the next physical
 	 * page in sequence and agree with the first entry on the
 	 * PTE_PROMOTE bits.
 	 */
 	pa += PAGE_SIZE;
 	for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) {
 		l3e = pmap_load(l3);
 		if (PTE_TO_PHYS(l3e) != pa) {
 			CTR2(KTR_PMAP,
 			    "pmap_promote_l2: failure for va %#lx pmap %p",
 			    va, pmap);
 			atomic_add_long(&pmap_l2_p_failures, 1);
 			return;
 		}
 		/* Same clean-writable downgrade as for the first entry. */
 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
 				l3e &= ~PTE_W;
 				break;
 			}
 		}
 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
 			CTR2(KTR_PMAP,
 			    "pmap_promote_l2: failure for va %#lx pmap %p",
 			    va, pmap);
 			atomic_add_long(&pmap_l2_p_failures, 1);
 			return;
 		}
 		pa += PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page so that a later demotion or removal can
 	 * retrieve it with pmap_remove_pt_page().
 	 */
 	ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
 	    ("pmap_promote_l2: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, ml3, true)) {
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
 		    va, pmap);
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		return;
 	}
 
 	/* Collapse the 4KB pv entries into one 2MB pv entry. */
 	if ((firstl3e & PTE_SW_MANAGED) != 0)
 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
 
 	pmap_store(l2, firstl3e);
 
 	atomic_add_long(&pmap_l2_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
 	    pmap);
 }
 #endif
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	A "psind" of 1 requests a 2MB mapping and is delegated to
  *	pmap_enter_l2().  Otherwise returns KERN_SUCCESS, or
  *	KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP is set and no
  *	page table page could be allocated.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t new_l3, orig_l3;
 	pt_entry_t *l3;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa, l2_pa, l3_pa;
 	vm_page_t mpte, om, l2_m, l3_m;
 	pt_entry_t entry;
 	pn_t l2_pn, l3_pn, pn;
 	int rv;
 	bool nosleep;
 
 	va = trunc_page(va);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
 	pa = VM_PAGE_TO_PHYS(m);
 	pn = (pa / PAGE_SIZE);
 
 	/* Build the new leaf entry from the requested protection. */
 	new_l3 = PTE_V | PTE_R | PTE_A;
 	if (prot & VM_PROT_EXECUTE)
 		new_l3 |= PTE_X;
 	/*
 	 * NOTE(review): "flags" is tested against a VM_PROT_* bit here;
 	 * presumably it also carries the access type that triggered the
 	 * call, so a write access pre-sets PTE_D -- confirm with callers.
 	 */
 	if (flags & VM_PROT_WRITE)
 		new_l3 |= PTE_D;
 	if (prot & VM_PROT_WRITE)
 		new_l3 |= PTE_W;
 	if (va < VM_MAX_USER_ADDRESS)
 		new_l3 |= PTE_U;
 
 	new_l3 |= (pn << PTE_PPN0_S);
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		new_l3 |= PTE_SW_WIRED;
 
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
 	 * the page is unmanaged. We do not want to take a fault
 	 * to do the dirty bit accounting for these mappings.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		if (prot & VM_PROT_WRITE)
 			new_l3 |= PTE_D;
 	} else
 		new_l3 |= PTE_SW_MANAGED;
 
 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
 
 	lock = NULL;
 	mpte = NULL;
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & L2_OFFSET) == 0,
 		    ("pmap_enter: va %#lx unaligned", va));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
 		goto out;
 	}
 
 	/*
 	 * Locate the L3 entry for "va".  Three cases: (1) a valid L2 entry
 	 * exists (an existing 2MB mapping is demoted first); (2) a user
 	 * address without a page table page, which is allocated through
 	 * pmap_alloc_l3(); (3) a kernel address, for which any missing L2/L3
 	 * page table pages are allocated directly.
 	 */
 	l2 = pmap_l2(pmap, va);
 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
 	    va, &lock))) {
 		l3 = pmap_l2_to_l3(l2, va);
 		if (va < VM_MAXUSER_ADDRESS) {
 			/* Take a reference for the mapping being added. */
 			mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 			mpte->ref_count++;
 		}
 	} else if (va < VM_MAXUSER_ADDRESS) {
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
 			if (lock != NULL)
 				rw_wunlock(lock);
 			rw_runlock(&pvh_global_lock);
 			PMAP_UNLOCK(pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		l3 = pmap_l3(pmap, va);
 	} else {
 		l3 = pmap_l3(pmap, va);
 		/* TODO: This is not optimal, but should mostly work */
 		if (l3 == NULL) {
 			if (l2 == NULL) {
 				l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED |
 				    VM_ALLOC_ZERO);
 				if (l2_m == NULL)
 					panic("pmap_enter: l2 pte_m == NULL");
 
 				l2_pa = VM_PAGE_TO_PHYS(l2_m);
 				l2_pn = (l2_pa / PAGE_SIZE);
 
 				l1 = pmap_l1(pmap, va);
 				entry = (PTE_V);
 				entry |= (l2_pn << PTE_PPN0_S);
 				pmap_store(l1, entry);
 				pmap_distribute_l1(pmap, pmap_l1_index(va), entry);
 				l2 = pmap_l1_to_l2(l1, va);
 			}
 
 			l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED |
 			    VM_ALLOC_ZERO);
 			if (l3_m == NULL)
 				panic("pmap_enter: l3 pte_m == NULL");
-			if ((l3_m->flags & PG_ZERO) == 0)
-				pmap_zero_page(l3_m);
 
 			l3_pa = VM_PAGE_TO_PHYS(l3_m);
 			l3_pn = (l3_pa / PAGE_SIZE);
 			entry = (PTE_V);
 			entry |= (l3_pn << PTE_PPN0_S);
 			pmap_store(l2, entry);
 			l3 = pmap_l2_to_l3(l2, va);
 		}
 		pmap_invalidate_page(pmap, va);
 	}
 
 	orig_l3 = pmap_load(l3);
 	opa = PTE_TO_PHYS(orig_l3);
 	/* "pv" stays NULL unless an old mapping's entry can be reused. */
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((orig_l3 & PTE_V) != 0) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
 		    (orig_l3 & PTE_SW_WIRED) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
 		    (orig_l3 & PTE_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->ref_count--;
 			KASSERT(mpte->ref_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
 			    (new_l3 & PTE_W) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		orig_l3 = pmap_load_clear(l3);
 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((orig_l3 & PTE_D) != 0)
 				vm_page_dirty(om);
 			if ((orig_l3 & PTE_A) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			KASSERT(pv != NULL,
 			    ("pmap_enter: no PV entry for %#lx", va));
 			/* The old entry is only kept for a managed new page. */
 			if ((new_l3 & PTE_SW_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		pmap_invalidate_page(pmap, va);
 		/* From here on the slot is treated as previously empty. */
 		orig_l3 = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((new_l3 & PTE_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
 		if (pv == NULL) {
 			/* A non-NULL lock pointer permits reclamation. */
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((new_l3 & PTE_W) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 validate:
 	/*
 	 * Sync the i-cache on all harts before updating the PTE
 	 * if the new PTE is executable.
 	 */
 	if (prot & VM_PROT_EXECUTE)
 		pmap_sync_icache(pmap, va, PAGE_SIZE);
 
 	/*
 	 * Update the L3 entry.
 	 */
 	if (orig_l3 != 0) {
 		/* Same physical page: swap entries and keep dirty state. */
 		orig_l3 = pmap_load_store(l3, new_l3);
 		pmap_invalidate_page(pmap, va);
 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
 		    ("pmap_enter: invalid update"));
 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
 		    (PTE_D | PTE_SW_MANAGED))
 			vm_page_dirty(m);
 	} else {
 		pmap_store(l3, new_l3);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Attempt a promotion to a 2MB mapping once the page table page is
 	 * fully populated and the page's reservation is fully populated.
 	 */
 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_l2(pmap, l2, va, &lock);
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to map the 2MB page that starts at "m" at virtual address "va"
  * with read permission, plus execute permission when "prot" requests it.
  * The work is delegated to pmap_enter_l2() with PMAP_ENTER_NOSLEEP,
  * PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM, so the attempt fails rather
  * than (1) sleeping for a page table page, (2) replacing a mapping that
  * already exists at "va", or (3) reclaiming another PV entry.  Returns true
  * only if pmap_enter_l2() reports KERN_SUCCESS.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pd_entry_t l2e;
 	pn_t ppn;
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Start from a valid, readable leaf entry for the page's frame. */
 	ppn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
 	l2e = (pd_entry_t)((ppn << PTE_PPN0_S) | PTE_R | PTE_V);
 	if (va < VM_MAXUSER_ADDRESS)
 		l2e |= PTE_U;
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		l2e |= PTE_X;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		l2e |= PTE_SW_MANAGED;
 
 	error = pmap_enter_l2(pmap, va, l2e, PMAP_ENTER_NOSLEEP |
 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp);
 	return (error == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * The caller must hold the pmap lock.  "new_l2" is the complete L2 entry to
  * store; its PTE_SW_MANAGED, PTE_SW_WIRED and PTE_W bits select the PV entry,
  * wired-count and PGA_WRITEABLE bookkeeping performed below.  When an entry
  * already exists at "va" and PMAP_ENTER_NOREPLACE is not set, the old 2MB
  * mapping, or the old 4KB mappings beneath it, are removed first.
  */
 static int
 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *l2, *l3, oldl2;
 	vm_offset_t sva;
 	vm_page_t l2pg, mt;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * With PMAP_ENTER_NOSLEEP, a NULL lock pointer is handed to
 	 * pmap_alloc_l2() instead of "lockp".
 	 */
 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	/* Locate the L2 entry for "va" through the direct map. */
 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
 	l2 = &l2[pmap_l2_index(va)];
 	if ((oldl2 = pmap_load(l2)) != 0) {
 		KASSERT(l2pg->ref_count > 1,
 		    ("pmap_enter_l2: l2pg's ref count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			/*
 			 * NOTE(review): presumably drops the reference taken
 			 * by pmap_alloc_l2() above -- confirm.
 			 */
 			l2pg->ref_count--;
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_FAILURE);
 		}
 		SLIST_INIT(&free);
 		/*
 		 * An old entry with any of PTE_RWX set is treated as a 2MB
 		 * mapping and removed as a unit; otherwise each valid L3
 		 * entry beneath it is removed, stopping as soon as
 		 * pmap_remove_l3() returns non-zero.
 		 */
 		if ((oldl2 & PTE_RWX) != 0)
 			(void)pmap_remove_l2(pmap, l2, va,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 		else
 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
 				l3 = pmap_l2_to_l3(l2, sva);
 				if ((pmap_load(l3) & PTE_V) != 0 &&
 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
 				    lockp) != 0)
 					break;
 			}
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			/*
 			 * Both pmap_remove_l2() and pmap_remove_l3() will
 			 * leave the kernel page table page zero filled.
 			 */
 			mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_l2: trie insert failed");
 		} else
 			KASSERT(pmap_load(l2) == 0,
 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
 	}
 
 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-structure
 				 * caches could nonetheless have entries that
 				 * refer to the freed page table pages.
 				 * Invalidate those entries.
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/*
 		 * For a writeable mapping, flag each of the
 		 * L2_SIZE / PAGE_SIZE pages starting at "m" as PGA_WRITEABLE.
 		 */
 		if ((new_l2 & PTE_W) != 0)
 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Increment counters.  Both are kept in units of 4KB pages, so a
 	 * 2MB mapping accounts for L2_SIZE / PAGE_SIZE of them.
 	 */
 	if ((new_l2 & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.
 	 */
 	pmap_store(l2, new_l2);
 
 	atomic_add_long(&pmap_l2_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
 	    va, pmap);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that belong to one object.  The run starts
  * with m_start, which is mapped at "start"; every later page on the list is
  * mapped at "start" plus its pindex distance from m_start.  The walk stops
  * at the end of the list or at the first page whose address would not be
  * below "end".  Virtual pages for which no resident page exists are simply
  * left unmapped.
  *
  * A 2MB mapping is attempted when the address is 2MB aligned, the whole
  * superpage fits below "end", the page has psind == 1 and superpages are
  * enabled for the pmap; otherwise, or if that attempt fails, a single 4KB
  * mapping is entered with pmap_enter_quick_locked().
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pvlock;
 	vm_offset_t va;
 	vm_page_t m, ptpage;
 	vm_pindex_t npages, off;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	ptpage = NULL;
 	pvlock = NULL;
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		off = m->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		va = start + ptoa(off);
 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
 		    pmap_enter_2mpage(pmap, va, m, prot, &pvlock)) {
 			/*
 			 * Advance to the last page of the superpage; the
 			 * loop step then moves past it.
 			 */
 			m = &m[L2_SIZE / PAGE_SIZE - 1];
 			continue;
 		}
 		ptpage = pmap_enter_quick_locked(pmap, va, m, prot, ptpage,
 		    &pvlock);
 	}
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Fast-path entry of a single 4KB mapping.  Acquires pvh_global_lock (read)
  * and the pmap lock, hands the work to pmap_enter_quick_locked(), and then
  * releases every lock held, including a PV list lock that the helper may
  * have left write-locked.  The helper's result is discarded, so the caller
  * is not told whether a mapping was actually created.
  *
  * This code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a read-only (and, if requested by "prot", executable) 4KB mapping
  * of page "m" at "va", provided that the L3 entry for "va" is currently zero.
  * The caller must hold the pmap lock and pvh_global_lock.
  *
  * For a user address, "mpte" may name the page table page used by a previous
  * call; it is reused when its pindex matches the one computed for "va".
  * Otherwise the page table page is found through the L2 entry or allocated
  * with _pmap_alloc_l3(), passing a NULL lock pointer.
  *
  * Returns the page table page that holds the new mapping for a user address.
  * Returns NULL for a kernel address, and whenever no mapping was created
  * (page table page allocation failed, a mapping already exists, or the PV
  * entry could not be inserted).
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	struct spglist free;
 	vm_paddr_t phys;
 	pd_entry_t *l2;
 	pt_entry_t *l3, newl3;
 
 	KASSERT(!VA_IS_CLEANMAP(va) ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t l2pindex;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		l2pindex = pmap_l2_pindex(va);
 		if (mpte && (mpte->pindex == l2pindex)) {
 			/* Same page table page as the caller's hint. */
 			mpte->ref_count++;
 		} else {
 			/*
 			 * Get the l2 entry
 			 */
 			l2 = pmap_l2(pmap, va);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (l2 != NULL && pmap_load(l2) != 0) {
 				phys = PTE_TO_PHYS(pmap_load(l2));
 				mpte = PHYS_TO_VM_PAGE(phys);
 				mpte->ref_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		/* Reach the L3 entry through the direct map. */
 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		l3 = &l3[pmap_l3_index(va)];
 	} else {
 		/* Kernel address: no page table page is tracked or returned. */
 		mpte = NULL;
 		l3 = pmap_l3(kernel_pmap, va);
 	}
 	if (l3 == NULL)
 		panic("pmap_enter_quick_locked: No l3");
 	/*
 	 * Never replace an existing entry: undo the reference taken above
 	 * and report that nothing was mapped.
 	 */
 	if (pmap_load(l3) != 0) {
 		if (mpte != NULL) {
 			mpte->ref_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		/*
 		 * No PV entry could be inserted: release the page table
 		 * page reference, freeing the page if pmap_unwire_ptp()
 		 * reports that it was released.
 		 */
 		if (mpte != NULL) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, false);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	/*
 	 * Build the new entry: valid and readable, never writeable;
 	 * executable, managed and user-accessible as applicable.
 	 */
 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
 	    PTE_V | PTE_R;
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		newl3 |= PTE_X;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newl3 |= PTE_SW_MANAGED;
 	if (va < VM_MAX_USER_ADDRESS)
 		newl3 |= PTE_U;
 
 	/*
 	 * Sync the i-cache on all harts before updating the PTE
 	 * if the new PTE is executable.
 	 */
 	if (prot & VM_PROT_EXECUTE)
 		pmap_sync_icache(pmap, va, PAGE_SIZE);
 
 	pmap_store(l3, newl3);
 
 	pmap_invalidate_page(pmap, va);
 	return (mpte);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  *
  * NOTE(review): as written here the function creates no mappings.  It only
  * asserts that "object" is write-locked and that its type is OBJT_DEVICE or
  * OBJT_SG; the parameters pmap, addr, pindex and size are unused.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  *
  *	A 2MB mapping that lies entirely within the range is unwired in place
  *	and the pmap's wired count is reduced by L2_SIZE / PAGE_SIZE, matching
  *	the amount added by pmap_enter_l2() when a wired 2MB mapping is
  *	created.  A 2MB mapping that is only partially covered is demoted
  *	first; that requires pvh_global_lock, which is acquired on first use.
  *	If it cannot be taken without blocking, the pmap lock is dropped, the
  *	global lock is acquired, and the scan resumes at the current address.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e;
 	bool pv_lists_locked;
 
 	pv_lists_locked = false;
 retry:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			/* Nothing mapped under this L1 entry; skip it. */
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if ((l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				if ((l2e & PTE_SW_WIRED) == 0)
 					panic("pmap_unwire: l2 %#jx is missing "
 					    "PTE_SW_WIRED", (uintmax_t)l2e);
 				pmap_clear_bits(l2, PTE_SW_WIRED);
 				/*
 				 * The wired count is kept in 4KB pages, so a
 				 * whole 2MB mapping accounts for
 				 * L2_SIZE / PAGE_SIZE of them.
 				 */
 				pmap->pm_stats.wired_count -=
 				    L2_SIZE / PAGE_SIZE;
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					/*
 					 * The flag is set before the attempt
 					 * because the lock is held on both
 					 * paths once control continues.
 					 */
 					pv_lists_locked = true;
 					if (!rw_try_rlock(&pvh_global_lock)) {
 						PMAP_UNLOCK(pmap);
 						rw_rlock(&pvh_global_lock);
 						/* Repeat sva. */
 						goto retry;
 					}
 				}
 				if (!pmap_demote_l2(pmap, l2, sva))
 					panic("pmap_unwire: demotion failed");
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			if ((l3e = pmap_load(l3)) == 0)
 				continue;
 			if ((l3e & PTE_SW_WIRED) == 0)
 				panic("pmap_unwire: l3 %#jx is missing "
 				    "PTE_SW_WIRED", (uintmax_t)l3e);
 
 			/*
 			 * PTE_SW_WIRED must be cleared atomically.  Although
 			 * the pmap lock synchronizes access to PTE_SW_WIRED,
 			 * another processor could be setting PTE_D and/or
 			 * PTE_A concurrently.
 			 */
 			pmap_clear_bits(l3, PTE_SW_WIRED);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	if (pv_lists_locked)
 		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 
 	/* Deliberately empty: no mappings are copied by this implementation. */
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page.  The page is
  *	reached through its direct map (DMAP) address and cleared with
  *	pagezero().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	void *dmap_va;
 
 	dmap_va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagezero(dmap_va);
 }
 
 /*
  *	pmap_zero_page_area zeros "size" bytes starting at byte offset
  *	"off" within the specified hardware page, which is accessed through
  *	its direct map (DMAP) address.  A request that covers the whole
  *	page is handled by pagezero(); anything else by bzero().
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *base;
 
 	base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero(base + off, size);
 		return;
 	}
 	pagezero((void *)base);
 }
 
 /*
  *	pmap_copy_page copies the contents of the page "msrc" into the
  *	page "mdst".  Both pages are accessed through their direct map
  *	(DMAP) addresses and the copy itself is done by pagecopy().
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * NOTE(review): presumably tells machine-independent code that unmapped
  * buffers may be used on this platform (non-zero = allowed) -- confirm
  * against the consumers of this variable, which are not visible here.
  */
 int unmapped_buf_allowed = 1;
 
 /*
  * Copies "xfersize" bytes from the page array "ma", starting at byte offset
  * "a_offset", into the page array "mb", starting at byte offset "b_offset".
  * The transfer is split into chunks that never cross a page boundary in
  * either array.  Both pages of every chunk are accessed through the direct
  * map; the function panics if a page's physical address is not covered by it.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_paddr_t src_pa, dst_pa;
 	vm_offset_t src_off, dst_off;
 	void *src, *dst;
 	int chunk;
 
 	for (; xfersize > 0; xfersize -= chunk) {
 		/* Split each offset into a page index and an in-page offset. */
 		src_off = a_offset & PAGE_MASK;
 		src_pa = ma[a_offset >> PAGE_SHIFT]->phys_addr;
 		dst_off = b_offset & PAGE_MASK;
 		dst_pa = mb[b_offset >> PAGE_SHIFT]->phys_addr;
 
 		/* Stay within the current source and destination pages. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		if (__predict_false(!PHYS_IN_DMAP(src_pa)))
 			panic("!DMAP a %lx", src_pa);
 		src = (char *)PHYS_TO_DMAP(src_pa) + src_off;
 		if (__predict_false(!PHYS_IN_DMAP(dst_pa)))
 			panic("!DMAP b %lx", dst_pa);
 		dst = (char *)PHYS_TO_DMAP(dst_pa) + dst_off;
 
 		bcopy(src, dst, chunk);
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 /*
  * Returns the direct map (DMAP) address of page "m".  No page table entry
  * is created or modified here.
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	return (PHYS_TO_DMAP(pa));
 }
 
 /*
  * Counterpart of pmap_quick_enter_page().  That function only returns a
  * direct map address and sets nothing up, so there is nothing to undo and
  * "addr" is unused.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 }
 
 /*
  * Reports whether "pmap" owns one of the first 16 PV entries examined for
  * page "m".  The page's own PV list is scanned first and then, unless the
  * page is fictitious, the list returned by pa_to_pvh() for its physical
  * address.  The limit of 16 may be raised or lowered in the future; it is
  * only necessary that true be returned for a small subset of pmaps for
  * proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *pvlock;
 	pv_entry_t pv;
 	boolean_t found;
 	int examined;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	examined = 0;
 	rw_rlock(&pvh_global_lock);
 	pvlock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(pvlock);
 
 	/* First the entries on the page's own PV list. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++examined >= 16)
 			goto done;
 	}
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto done;
 
 	/* Then the entries on the pa_to_pvh() list, within the same budget. */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++examined >= 16)
 			goto done;
 	}
 done:
 	rw_runlock(pvlock);
 	rw_runlock(&pvh_global_lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  *
  *	Unmanaged pages always yield 0.  Both the page's own PV list (4KB
  *	mappings, checked at the L3 entry) and, for non-fictitious pages,
  *	the pa_to_pvh() list (checked at the L2 entry) are counted.  If a
  *	PV list changes while its lock is dropped to wait for a pmap lock,
  *	the count is discarded and the whole scan restarts.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pd_entry_t *l2;
 	pt_entry_t *l3;
 	pv_entry_t pv;
 	int count, md_gen, pvh_gen;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	rw_rlock(&pvh_global_lock);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock is contended.  Record the list's
 			 * generation, release the PV list lock, block on
 			 * the pmap lock, and retake the PV list lock.  A
 			 * changed generation means the list was modified
 			 * in between, so start over.
 			 */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/*
 				 * Same protocol as above, but a change to
 				 * either list forces a restart.
 				 */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* The wired bit is read from the L2 entry itself. */
 			l2 = pmap_l2(pmap, pv->pv_va);
 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	rw_runlock(&pvh_global_lock);
 	return (count);
 }
 
 /*
  * Returns true if the given page is mapped individually or as part of
  * a 2mpage, and false otherwise.  Unmanaged pages always yield false.  A
  * page counts as mapped when its own PV list is non-empty or, for a
  * non-fictitious page, when the pa_to_pvh() list for its physical address
  * is non-empty.
  */
 bool
 pmap_page_is_mapped(vm_page_t m)
 {
 	struct rwlock *pvlock;
 	bool mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (false);
 	pvlock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(pvlock);
 	mapped = !TAILQ_EMPTY(&m->md.pv_list);
 	if (!mapped && (m->flags & PG_FICTITIOUS) == 0)
 		mapped = !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
 	rw_runlock(pvlock);
 	return (mapped);
 }
 
 /*
  * PV bookkeeping helper for pmap_remove_pages().  Unlinks "pv" from the PV
  * list it lives on (the page's own list for a 4KB mapping, the pa_to_pvh()
  * list for a superpage), adjusts the pmap's resident count, and clears
  * PGA_WRITEABLE on pages that are left without any PV entry.  For a
  * superpage, the page table page returned by pmap_remove_pt_page(), if any,
  * is also queued on "free".
  */
 static void
 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
     struct spglist *free, bool superpage)
 {
 	struct md_page *pvh;
 	vm_page_t ptpage, mt;
 	int i;
 
 	if (!superpage) {
 		pmap_resident_count_dec(pmap, 1);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if (TAILQ_EMPTY(&m->md.pv_list) &&
 		    (m->a.flags & PGA_WRITEABLE) != 0) {
 			pvh = pa_to_pvh(m->phys_addr);
 			if (TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 		return;
 	}
 
 	/* Superpage: account for all Ln_ENTRIES constituent pages. */
 	pmap_resident_count_dec(pmap, Ln_ENTRIES);
 	pvh = pa_to_pvh(m->phys_addr);
 	TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 	if (TAILQ_EMPTY(&pvh->pv_list)) {
 		for (i = 0; i < Ln_ENTRIES; i++) {
 			mt = &m[i];
 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
 			    (mt->a.flags & PGA_WRITEABLE) != 0)
 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
 		}
 	}
 
 	/* Release the page table page saved for this superpage, if any. */
 	ptpage = pmap_remove_pt_page(pmap, pv->pv_va);
 	if (ptpage == NULL)
 		return;
 	KASSERT(ptpage->valid == VM_PAGE_BITS_ALL,
 	    ("pmap_remove_pages: pte page not promoted"));
 	pmap_resident_count_dec(pmap, 1);
 	KASSERT(ptpage->ref_count == Ln_ENTRIES,
 	    ("pmap_remove_pages: pte page ref count error"));
 	ptpage->ref_count = 0;
 	pmap_add_delayed_free_list(ptpage, free, FALSE);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  *
  * Wired mappings are skipped, and a PV chunk that still holds one is
  * kept; every other chunk is freed.  Page table pages that become
  * unused are collected on a local list and freed after all locks have
  * been released.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	struct spglist free;
 	pd_entry_t ptepde;
 	pt_entry_t *pte, tpte;
 	vm_page_t m, mt;
 	pv_entry_t pv;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx;
 	bool superpage;
 
 	lock = NULL;
 
 	SLIST_INIT(&free);
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/*
 			 * A set bit in pc_map marks a free slot (see "Mark
 			 * free" below), so the complement, limited to the
 			 * bits in pc_freemask, is the set of entries in use.
 			 */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = ffsl(inuse) - 1;
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				/*
 				 * Walk down to the entry that maps pv_va.
 				 * "ptepde" ends up holding the entry one
 				 * level above it: the L1 entry for a
 				 * superpage, the L2 entry for a 4KB page.
 				 */
 				pte = pmap_l1(pmap, pv->pv_va);
 				ptepde = pmap_load(pte);
 				pte = pmap_l1_to_l2(pte, pv->pv_va);
 				tpte = pmap_load(pte);
 				if ((tpte & PTE_RWX) != 0) {
 					superpage = true;
 				} else {
 					ptepde = tpte;
 					pte = pmap_l2_to_l3(pte, pv->pv_va);
 					tpte = pmap_load(pte);
 					superpage = false;
 				}
 
 				/*
 				 * We cannot remove wired pages from a
 				 * process' mapping at this time.
 				 */
 				if (tpte & PTE_SW_WIRED) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte));
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad pte %#jx",
 				    (uintmax_t)tpte));
 
 				pmap_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PTE_D | PTE_W)) ==
 				    (PTE_D | PTE_W)) {
 					if (superpage)
 						for (mt = m;
 						    mt < &m[Ln_ENTRIES]; mt++)
 							vm_page_dirty(mt);
 					else
 						vm_page_dirty(m);
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 
 				pmap_remove_pages_pv(pmap, m, pv, &free,
 				    superpage);
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Only a chunk without any skipped (wired) entry is freed. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  * Returns true if some mapping of page "m" has every requested bit set in
  * its page table entry: PTE_A when "accessed" is true and PTE_D when
  * "modified" is true.  The page's own PV list is examined at the L3 level
  * and, for non-fictitious pages, the pa_to_pvh() list at the L2 level.  The
  * scan stops at the first match.
  *
  * NOTE(review): with both flags false the mask is 0, so the first mapping
  * examined matches.
  */
 static bool
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pd_entry_t *l2;
 	pt_entry_t *l3, mask;
 	pv_entry_t pv;
 	pmap_t pmap;
 	int md_gen, pvh_gen;
 	bool rv;
 
 	mask = 0;
 	if (modified)
 		mask |= PTE_D;
 	if (accessed)
 		mask |= PTE_A;
 
 	rv = FALSE;
 	rw_rlock(&pvh_global_lock);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock is contended.  Drop the PV list
 			 * lock, block on the pmap lock, retake the PV list
 			 * lock, and restart if the list's generation count
 			 * shows that it changed in between.
 			 */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		rv = (pmap_load(l3) & mask) == mask;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/* As above, but either list may have changed. */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* The bits are tested on the L2 entry itself. */
 			l2 = pmap_l2(pmap, pv->pv_va);
 			rv = (pmap_load(l2) & mask) == mask;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether any mapping of the given managed page has its
  *	dirty bit set.  A page that is not write mapped is reported
  *	as unmodified without looking at its mappings.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * If the page is not busied then this check is racy.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.  An address is eligible if and only if its L3 entry
  *	can be reached (pmap_l3() returns non-NULL) and that entry is
  *	currently empty.  An address that already has a mapping is not
  *	eligible: pmap_enter_quick_locked() refuses to replace a non-zero
  *	L3 entry, so prefaulting it would only be wasted work.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *l3;
 	boolean_t rv;
 
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	l3 = pmap_l3(pmap, addr);
 	/* Eligible only when the slot exists and nothing is mapped in it. */
 	if (l3 != NULL && pmap_load(l3) == 0) {
 		rv = TRUE;
 	}
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Report whether any mapping of the given managed page has its
  *	accessed bit set.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and busied.  Nothing is done if the page is not
  * write mapped.  Writeable 2MB mappings that cover the page are demoted
  * first; then PTE_W and PTE_D are cleared in every 4KB mapping, marking the
  * page dirty when a cleared entry had PTE_D set.  Finally PGA_WRITEABLE is
  * cleared on the page.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pd_entry_t *l2;
 	pt_entry_t *l3, oldl3, newl3;
 	pv_entry_t next_pv, pv;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 	vm_page_assert_busied(m);
 
 	if (!pmap_page_is_write_mapped(m))
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* A fictitious page uses pv_dummy in place of a pa_to_pvh() list. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	rw_rlock(&pvh_global_lock);
 retry_pv_loop:
 	rw_wlock(lock);
 	/* Pass 1: demote every writeable mapping found on the pvh list. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock is contended.  Drop the PV list
 			 * lock, block on the pmap lock, retake the PV list
 			 * lock, and start over if the list changed.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		if ((pmap_load(l2) & PTE_W) != 0)
 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: write-protect each 4KB mapping of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		oldl3 = pmap_load(l3);
 retry:
 		if ((oldl3 & PTE_W) != 0) {
 			/*
 			 * Clear PTE_W and PTE_D with a compare-and-set.  On
 			 * failure atomic_fcmpset_long() refreshes oldl3, so
 			 * the test above is repeated on the current value.
 			 */
 			newl3 = oldl3 & ~(PTE_D | PTE_W);
 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
 				goto retry;
 			/* Preserve the dirty state that was just cleared. */
 			if ((oldl3 & PTE_D) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_runlock(&pvh_global_lock);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct spglist free;
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	pd_entry_t *l2, l2e;
 	pt_entry_t *l3, l3e;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	int cleared, md_gen, not_cleared, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	/*
 	 * NOTE(review): nothing in this function adds a page to "free"; the
 	 * list is initialized here and is still empty when it is handed to
 	 * vm_page_free_pages_toq() at "out".
 	 */
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	/*
 	 * pvh->pv_list is walked first, as the list of 2MB (L2) mappings;
 	 * fictitious pages use the shared pv_dummy header instead of a
 	 * per-superpage one.
 	 */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	rw_rlock(&pvh_global_lock);
 	rw_wlock(lock);
 retry:
 	/*
 	 * Restart point.  Only not_cleared is reset here; "cleared" keeps
 	 * whatever was accumulated before the restart.
 	 */
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock could not be taken without blocking.
 			 * Snapshot the list generation, drop the PV list
 			 * lock, block on the pmap lock, then retake the PV
 			 * list lock.  If the generation moved in the
 			 * meantime, start over.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		l2e = pmap_load(l2);
 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
 			/*
 			 * Although l2e is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((l2e & PTE_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
 			 * tested.  Apply a simple "hash" function on the
 			 * physical page number, the virtual superpage number,
 			 * and the pmap address to select one 4KB page out of
 			 * the 512 on which testing the reference bit will
 			 * result in clearing that reference bit.  This
 			 * function is designed to avoid the selection of the
 			 * same 4KB page for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
 			    (l2e & PTE_SW_WIRED) == 0) {
 				pmap_clear_bits(l2, PTE_A);
 				pmap_invalidate_page(pmap, va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		/* Stop early once enough referenced mappings were seen. */
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 		/*
 		 * Because each visited entry is moved to the tail, the walk
 		 * is complete when the original first entry is at the head
 		 * again.
 		 */
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	/* Second pass: the page's own list of 4KB (L3) mappings. */
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Same blocking fallback as above, except that both
 			 * the superpage list and the page's own list are
 			 * checked for changes before continuing.
 			 */
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 
 		/* The L2 entry must point to an L3 table, not be a leaf. */
 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		l3e = pmap_load(l3);
 		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
 		if ((l3e & PTE_A) != 0) {
 			if ((l3e & PTE_SW_WIRED) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for
 				 * them is wasted effort. We do the
 				 * hard work for unwired pages only.
 				 */
 				pmap_clear_bits(l3, PTE_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	vm_page_free_pages_toq(&free, false);
 	/*
 	 * The result counts every mapping found with PTE_A set, whether or
 	 * not the bit was actually cleared.
 	 */
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  *
  *	NOTE(review): the body below is empty, so none of the above is
  *	performed here; every argument is currently ignored.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed and busied (both are asserted).  Nothing is
  *	done when the page is not write mapped.  Otherwise every mapping of
  *	the page is visited under the PV list lock: writable 2MB mappings are
  *	demoted first, and PTE_D and PTE_W are cleared together on the 4KB
  *	entries handled below.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *l2, oldl2;
 	pt_entry_t *l3;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	vm_page_assert_busied(m);
 
 	if (!pmap_page_is_write_mapped(m))
 	        return;
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->a.flags & PGA_WRITEABLE) == 0)
 		return;
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(&pvh_global_lock);
 	rw_wlock(lock);
 restart:
 	/* First pass: 2MB mappings recorded on the superpage header. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Block on the pmap lock with the PV list lock
 			 * dropped; if the list generation changed while it
 			 * was dropped, rescan from the beginning.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		oldl2 = pmap_load(l2);
 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
 		/*
 		 * The conditions are evaluated left to right, so a writable
 		 * mapping is demoted even when it turns out to be wired; only
 		 * the write-protect step below is skipped for wired mappings.
 		 */
 		if ((oldl2 & PTE_W) != 0 &&
 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
 		    (oldl2 & PTE_SW_WIRED) == 0) {
 			/*
 			 * Write protect the mapping to a single page so that
 			 * a subsequent write access may repromote.
 			 */
 			/* Advance va to m's 4KB page within the old 2MB run. */
 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
 			l3 = pmap_l2_to_l3(l2, va);
 			pmap_clear_bits(l3, PTE_D | PTE_W);
 			vm_page_dirty(m);
 			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* Second pass: 4KB mappings recorded on the page itself. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* As above, but both list generations are checked. */
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		/* Only entries that are both dirty and writable are touched. */
 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
 			pmap_clear_bits(l3, PTE_D | PTE_W);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 }
 
 /*
  * Return a kernel virtual address for the physical address "pa" by
  * translating it through the direct map.  "size" is not consulted.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 	vm_offset_t dmap_va;
 
 	dmap_va = PHYS_TO_DMAP(pa);
 	return ((void *)dmap_va);
 }
 
 /*
  * Counterpart of pmap_mapbios().  The body is empty: pmap_mapbios() only
  * hands out a direct map address, so there is nothing to tear down here and
  * both arguments are ignored.
  */
 void
 pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
 {
 }
 
 /*
  * Record "ma" as the memory attribute of page "m" and, unless the page is
  * fictitious, push the new attribute to the page's direct map alias.
  * Panics if the direct map update reports failure.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	vm_offset_t dmap_va;
 
 	m->md.pv_memattr = ma;
 
 	/* Fictitious pages get no direct map update. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * Callers rely on this update to carry out whatever cache
 	 * operations data coherence requires for a normal page.
 	 */
 	dmap_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (pmap_change_attr(dmap_va, PAGE_SIZE, m->md.pv_memattr) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Change the memory type of the virtual address range starting at "va" and
  * spanning "size" bytes to "mode".  The whole range must lie within either
  * the direct map or the kernel map.
  *
  * The kernel pmap lock is held around pmap_change_attr_locked(), whose
  * result is returned unchanged.  The contract is: zero on success, EINVAL
  * when some part of the range is not mapped, and ENOMEM when memory ran out
  * while making the change, in which case part of the range may already
  * carry the new memory type.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	int rv;
 
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_attr_locked(va, size, mode);
 	PMAP_UNLOCK(kernel_pmap);
 
 	return (rv);
 }
 
 /*
  * Body of pmap_change_attr(); the kernel pmap lock must be held (asserted).
  *
  * As written, this only validates the range: it walks the kernel page
  * tables over the page-aligned span covering [va, va + size) and returns
  * EINVAL if the start lies in neither the direct map nor the kernel address
  * range, or if any L1/L2/L3 entry on the way is missing or lacks PTE_V.
  * "mode" is never applied to an entry; see the TODO notes.  Returns zero
  * when the whole range is mapped.
  */
 static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pd_entry_t *l1, l1e;
 	pd_entry_t *l2, l2e;
 	pt_entry_t *l3, l3e;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	/* Widen the request to whole pages. */
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/* Only the starting address is range checked here. */
 	if (!VIRT_IN_DMAP(base) &&
 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
 		return (EINVAL);
 
 	for (tmpva = base; tmpva < base + size; ) {
 		l1 = pmap_l1(kernel_pmap, tmpva);
 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
 			return (EINVAL);
 		if ((l1e & PTE_RWX) != 0) {
 			/*
 			 * TODO: Demote if attributes don't match and there
 			 * isn't an L1 page left in the range, and update the
 			 * L1 entry if the attributes don't match but there is
 			 * an L1 page left in the range, once we support the
 			 * upcoming Svpbmt extension.
 			 */
 			/* Leaf at L1: skip to the next L1 boundary. */
 			tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
 			continue;
 		}
 		l2 = pmap_l1_to_l2(l1, tmpva);
 		if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
 			return (EINVAL);
 		if ((l2e & PTE_RWX) != 0) {
 			/*
 			 * TODO: Demote if attributes don't match and there
 			 * isn't an L2 page left in the range, and update the
 			 * L2 entry if the attributes don't match but there is
 			 * an L2 page left in the range, once we support the
 			 * upcoming Svpbmt extension.
 			 */
 			/* Leaf at L2: skip to the next L2 boundary. */
 			tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
 			continue;
 		}
 		l3 = pmap_l2_to_l3(l2, tmpva);
 		/* l3e is loaded only for the validity test below. */
 		if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0)
 			return (EINVAL);
 		/*
 		 * TODO: Update the L3 entry if the attributes don't match once
 		 * we support the upcoming Svpbmt extension.
 		 */
 		tmpva += PAGE_SIZE;
 	}
 
 	return (0);
 }
 
 /*
  * pmap portion of mincore(2): report the residency state of "addr" within
  * "pmap".  The return value is a mask of MINCORE_* flags, or zero when the
  * address has no valid mapping.  When the mapping is managed and was not
  * found to be both referenced and modified through this pmap, the mapped
  * physical address is stored in *pap so that the caller can look for other
  * mappings; otherwise *pap is left untouched.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
 {
 	const int other_bits = MINCORE_MODIFIED_OTHER |
 	    MINCORE_REFERENCED_OTHER;
 	pt_entry_t *l2, *l3, tpte;
 	vm_paddr_t pa;
 	int val;
 	bool managed;
 
 	/* Result for "no valid mapping"; overwritten once one is found. */
 	val = 0;
 	managed = false;
 
 	PMAP_LOCK(pmap);
 	l2 = pmap_l2(pmap, addr);
 	if (l2 == NULL)
 		goto done;
 	tpte = pmap_load(l2);
 	if ((tpte & PTE_V) == 0)
 		goto done;
 
 	if ((tpte & PTE_RWX) == 0) {
 		/* The L2 entry points to an L3 table; inspect the 4KB PTE. */
 		l3 = pmap_l2_to_l3(l2, addr);
 		tpte = pmap_load(l3);
 		if ((tpte & PTE_V) == 0)
 			goto done;
 		pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
 		val = MINCORE_INCORE;
 	} else {
 		/* The L2 entry is itself a leaf (superpage) mapping. */
 		pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
 		val = MINCORE_INCORE | MINCORE_PSIND(1);
 	}
 
 	if ((tpte & PTE_D) != 0)
 		val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 	if ((tpte & PTE_A) != 0)
 		val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
 done:
 	if (managed && (val & other_bits) != other_bits)
 		*pap = pa;
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Switch the current hart to the pmap of thread "td".
  *
  * Nothing is done when that pmap is already the per-CPU curpmap.  Otherwise
  * the new pmap's satp value is loaded, this hart is marked active in the
  * new pmap and inactive in the old one, curpmap is updated, and an
  * sfence_vma() is issued.
  */
 void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
 	u_int hart;
 
 	oldpmap = PCPU_GET(curpmap);
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	if (pmap == oldpmap)
 		return;
 	load_satp(pmap->pm_satp);
 
 	hart = PCPU_GET(hart);
 	/* SMP kernels update the pm_active sets atomically. */
 #ifdef SMP
 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
 #else
 	CPU_SET(hart, &pmap->pm_active);
 	CPU_CLR(hart, &oldpmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 
 	sfence_vma();
 }
 
 /*
  * Activate the pmap of thread "td" on the current hart by running
  * pmap_activate_sw() inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 
 	critical_enter();
 	pmap_activate_sw(td);
 	critical_exit();
 }
 
 /*
  * Record "pmap" as active on the current hart and as this CPU's curpmap.
  * Unlike pmap_activate_sw(), this neither loads satp, nor clears the hart
  * from a previous pmap, nor issues a fence.
  */
 void
 pmap_activate_boot(pmap_t pmap)
 {
 	u_int hart;
 
 	hart = PCPU_GET(hart);
 #ifdef SMP
 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
 #else
 	CPU_SET(hart, &pmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  * Synchronize instruction fetch with earlier stores on all harts.
  *
  * The "pmap", "va" and "sz" arguments are not consulted: a FENCE.I is
  * always executed locally and, once smp_started is set and other harts
  * exist in all_harts, a remote FENCE.I is requested for every other hart
  * through the SBI.  The thread is pinned for the duration so that the hart
  * excluded from the mask is the one that ran the local fence.
  */
 void
 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
 {
 	cpuset_t mask;
 
 	/*
 	 * From the RISC-V User-Level ISA V2.2:
 	 *
 	 * "To make a store to instruction memory visible to all
 	 * RISC-V harts, the writing hart has to execute a data FENCE
 	 * before requesting that all remote RISC-V harts execute a
 	 * FENCE.I."
 	 *
 	 * However, this is slightly misleading; we still need to
 	 * perform a FENCE.I for the local hart, as FENCE does nothing
 	 * for its icache. FENCE.I alone is also sufficient for the
 	 * local hart.
 	 */
 	sched_pin();
 	/* Target every hart except the one we are running on. */
 	mask = all_harts;
 	CPU_CLR(PCPU_GET(hart), &mask);
 	fence_i();
 	if (!CPU_EMPTY(&mask) && smp_started) {
 		/* Data fence first, then the remote FENCE.I request. */
 		fence();
 		sbi_remote_fence_i(mask.__bits);
 	}
 	sched_unpin();
 }
 
 /*
  *	Possibly advance *addr so that its offset within a 2MB superpage
  *	matches that of the (color adjusted) object offset, which allows
  *	more of the mapping to be backed by superpages.  *addr is left alone
  *	when the mapping is too small to contain an aligned superpage or
  *	when it is already suitably aligned.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t addr_off, lead, target_off;
 
 	/* A mapping smaller than a superpage can never use one. */
 	if (size < L2_SIZE)
 		return;
 
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	target_off = offset & L2_OFFSET;
 
 	/* Bytes preceding the first superpage boundary in the object. */
 	lead = (L2_SIZE - target_off) & L2_OFFSET;
 	if (size - lead < L2_SIZE)
 		return;
 
 	addr_off = *addr & L2_OFFSET;
 	if (addr_off == target_off)
 		return;
 
 	/* Move forward to the nearest address with the wanted offset. */
 	if (addr_off < target_off)
 		*addr = (*addr & ~L2_OFFSET) + target_off;
 	else
 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + target_off;
 }
 
 /**
  * Get the kernel virtual address of a set of physical pages. If there are
  * physical addresses not covered by the DMAP perform a transient mapping
  * that will be removed when calling pmap_unmap_io_transient.
  *
  * \param page        The pages the caller wishes to obtain the virtual
  *                    address on the kernel memory map.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of the pages passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  * \note As written here, the transient mapping itself is not implemented:
  *       any page at or above DMAP_MAX_PHYSADDR makes the second loop call
  *       panic().  FALSE is therefore the only value returned on the
  *       non-panicking path.
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t paddr;
 	boolean_t needs_mapping;
 	int error, i;
 
 	/*
 	 * Allocate any KVA space that we need, this is done in a separate
 	 * loop to prevent calling vmem_alloc while pinned.
 	 */
 	needs_mapping = FALSE;
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
 			/* M_WAITOK: the allocation is asserted to succeed. */
 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 			needs_mapping = TRUE;
 		} else {
 			vaddr[i] = PHYS_TO_DMAP(paddr);
 		}
 	}
 
 	/* Exit early if everything is covered by the DMAP */
 	if (!needs_mapping)
 		return (FALSE);
 
 	/* Threads that cannot fault are pinned while the mapping exists. */
 	if (!can_fault)
 		sched_pin();
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (paddr >= DMAP_MAX_PHYSADDR) {
 			panic(
 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
 		}
 	}
 
 	return (needs_mapping);
 }
 
 /*
  * Undo pmap_map_io_transient().  The thread is unpinned when "can_fault" is
  * false.  Removing a mapping for a page outside the direct map is not
  * implemented: encountering such a page panics.  "vaddr" is not consulted.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	int idx;
 
 	if (!can_fault)
 		sched_unpin();
 
 	for (idx = 0; idx < count; idx++) {
 		/* Pages inside the direct map need no teardown. */
 		if (VM_PAGE_TO_PHYS(page[idx]) < DMAP_MAX_PHYSADDR)
 			continue;
 		panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
 	}
 }
 
 /*
  * Report whether "mode" lies within the inclusive range
  * [VM_MEMATTR_DEVICE, VM_MEMATTR_WRITE_BACK].  The pmap is not consulted.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < VM_MEMATTR_DEVICE)
 		return (FALSE);
 	return (mode <= VM_MEMATTR_WRITE_BACK);
 }
 
 /*
  * Look up the page table entries that translate "va" in "pmap".
  *
  * *l1 is always written, possibly with NULL.  *l2 and *l3 are written only
  * as far as the walk gets:
  *  - L1 entry missing or without PTE_V: returns false; *l2 and *l3 are not
  *    written.
  *  - L1 entry is a leaf (tested via PTE_RX): *l2 = *l3 = NULL, returns true.
  *  - L2 entry missing or without PTE_V: *l2 is written, *l3 is not, returns
  *    false.
  *  - L2 entry is a leaf (tested via PTE_RX): *l3 = NULL, returns true.
  *  - Otherwise *l3 points to the L3 entry and true is returned; the L3
  *    entry itself is not checked for validity.
  */
 bool
 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
     pt_entry_t **l3)
 {
 	pd_entry_t *l1p, *l2p;
 
 	/* Get l1 directory entry. */
 	l1p = pmap_l1(pmap, va);
 	*l1 = l1p;
 
 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
 		return (false);
 
 	/* The entry is loaded again here rather than reusing the value. */
 	if ((pmap_load(l1p) & PTE_RX) != 0) {
 		*l2 = NULL;
 		*l3 = NULL;
 		return (true);
 	}
 
 	/* Get l2 directory entry. */
 	l2p = pmap_l1_to_l2(l1p, va);
 	*l2 = l2p;
 
 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
 		return (false);
 
 	if ((pmap_load(l2p) & PTE_RX) != 0) {
 		*l3 = NULL;
 		return (true);
 	}
 
 	/* Get l3 page table entry. */
 	*l3 = pmap_l2_to_l3(l2p, va);
 
 	return (true);
 }
 
 /*
  * Track a range of the kernel's virtual address space that is contiguous
  * in various mapping attributes.
  */
 struct pmap_kernel_map_range {
 	vm_offset_t sva;	/* start VA; all ones means "no open range" */
 	pt_entry_t attrs;	/* PTE attribute bits shared by the range */
 	int l3pages;		/* leaf L3 entries counted in the range */
 	int l2pages;		/* leaf L2 entries counted in the range */
 	int l1pages;		/* leaf L1 entries counted in the range */
 };
 
 /*
  * Emit one line describing the range accumulated in "range", which ends at
  * "eva", and then mark the range as closed by storing the sentinel start
  * address.  Nothing is emitted, and the range is left as is, when "eva"
  * does not lie beyond the range's start.
  */
 static void
 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
     vm_offset_t eva)
 {
 	char wr, ex, us, gl;
 
 	if (eva <= range->sva)
 		return;
 
 	/* One flag character per attribute bit of interest. */
 	wr = (range->attrs & PTE_W) == PTE_W ? 'w' : '-';
 	ex = (range->attrs & PTE_X) == PTE_X ? 'x' : '-';
 	us = (range->attrs & PTE_U) == PTE_U ? 'u' : 's';
 	gl = (range->attrs & PTE_G) == PTE_G ? 'g' : '-';
 
 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n",
 	    range->sva, eva, wr, ex, us, gl,
 	    range->l1pages, range->l2pages, range->l3pages);
 
 	/* Reset to sentinel value. */
 	range->sva = 0xfffffffffffffffful;
 }
 
 /*
  * Report whether "attrs", as derived from a page table entry, is identical
  * to the attribute set recorded for the range currently being accumulated.
  */
 static bool
 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
 {
 	bool same;
 
 	same = (attrs == range->attrs);
 	return (same);
 }
 
 /*
  * Start a new range at "va" with attributes "attrs".  The whole structure
  * is zeroed first, so the l1pages/l2pages/l3pages counters restart at zero.
  */
 static void
 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
     pt_entry_t attrs)
 {
 
 	memset(range, 0, sizeof(*range));
 	range->sva = va;
 	range->attrs = attrs;
 }
 
 /*
  * Given a leaf PTE, derive the mapping's attributes. If they do not match
  * those of the current run, dump the address range and its attributes, and
  * begin a new run.
  *
  * The caller passes the entries seen at each level on the way to the leaf,
  * with zero for the levels below the leaf.
  */
 static void
 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
 {
 	pt_entry_t attrs;
 
 	/* The PTE global bit is inherited by lower levels. */
 	attrs = l1e & PTE_G;
 	/*
 	 * Two independent tests follow.  The first takes the permission bits
 	 * from an L1 leaf, or else accumulates the L2 entry's global bit.
 	 * The second takes the permission bits from an L2 leaf, or else from
 	 * the L3 entry together with its global bit.
 	 */
 	if ((l1e & PTE_RWX) != 0)
 		attrs |= l1e & (PTE_RWX | PTE_U);
 	else if (l2e != 0)
 		attrs |= l2e & PTE_G;
 	if ((l2e & PTE_RWX) != 0)
 		attrs |= l2e & (PTE_RWX | PTE_U);
 	else if (l3e != 0)
 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
 
 	/*
 	 * range->sva > va also holds while the range carries the all-ones
 	 * sentinel, i.e. when no range is open; the dump call is then a
 	 * no-op and a new range simply starts at va.
 	 */
 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
 		sysctl_kmaps_dump(sb, range, va);
 		sysctl_kmaps_reinit(range, va, attrs);
 	}
 }
 
 /*
  * Sysctl handler that writes a textual description of the kernel page
  * tables to the request: one line per run of consecutive mappings sharing
  * the same attributes, as produced by sysctl_kmaps_dump().
  *
  * The walk starts at the L1 slot of VM_MIN_KERNEL_ADDRESS and continues to
  * the end of the L1 table.  An invalid entry at any level closes the open
  * run; a leaf entry is passed to sysctl_kmaps_check() and counted in the
  * matching l1pages/l2pages/l3pages field.
  *
  * Returns the error from wiring the user buffer, or else the result of
  * sbuf_finish().
  *
  * NOTE(review): no sysctl_kmaps_dump() call follows the outer loop, so a
  * run that is still open when the last L1 slot has been processed is not
  * written out.
  */
 static int
 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 {
 	struct pmap_kernel_map_range range;
 	struct sbuf sbuf, *sb;
 	pd_entry_t l1e, *l2, l2e;
 	pt_entry_t *l3, l3e;
 	vm_offset_t sva;
 	vm_paddr_t pa;
 	int error, i, j, k;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = &sbuf;
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
 
 	/* Sentinel value. */
 	range.sva = 0xfffffffffffffffful;
 
 	/*
 	 * Iterate over the kernel page tables without holding the kernel pmap
 	 * lock. Kernel page table pages are never freed, so at worst we will
 	 * observe inconsistencies in the output.
 	 */
 	sva = VM_MIN_KERNEL_ADDRESS;
 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
 		/* Section headers are keyed on the L1 slot being entered. */
 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
 			sbuf_printf(sb, "\nDirect map:\n");
 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
 			sbuf_printf(sb, "\nKernel map:\n");
 
 		l1e = kernel_pmap->pm_l1[i];
 		if ((l1e & PTE_V) == 0) {
 			sysctl_kmaps_dump(sb, &range, sva);
 			sva += L1_SIZE;
 			continue;
 		}
 		if ((l1e & PTE_RWX) != 0) {
 			/* Leaf at L1. */
 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
 			range.l1pages++;
 			sva += L1_SIZE;
 			continue;
 		}
 		/* Descend: the L2 table is reached through the direct map. */
 		pa = PTE_TO_PHYS(l1e);
 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
 
 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
 			l2e = l2[j];
 			if ((l2e & PTE_V) == 0) {
 				sysctl_kmaps_dump(sb, &range, sva);
 				sva += L2_SIZE;
 				continue;
 			}
 			if ((l2e & PTE_RWX) != 0) {
 				/* Leaf at L2. */
 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
 				range.l2pages++;
 				sva += L2_SIZE;
 				continue;
 			}
 			pa = PTE_TO_PHYS(l2e);
 			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
 
 			/* sva advances in the loop header, even on continue. */
 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
 			    sva += L3_SIZE) {
 				l3e = l3[k];
 				if ((l3e & PTE_V) == 0) {
 					sysctl_kmaps_dump(sb, &range, sva);
 					continue;
 				}
 				sysctl_kmaps_check(sb, &range, sva,
 				    l1e, l2e, l3e);
 				range.l3pages++;
 			}
 		}
 	}
 
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (error);
 }
 /* Read-only string node under _vm_pmap, served by sysctl_kmaps(). */
 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
     NULL, 0, sysctl_kmaps, "A",
     "Dump kernel address layout");
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index a510295b3c65..68553bda2249 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -1,5859 +1,5854 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * uma_core.c  Implementation of the Universal Memory allocator
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
  *
  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  * are well known.
  *
  */
 
 /*
  * TODO:
  *	- Improve memory usage for large allocations
  *	- Investigate cache size adjustments
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_param.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/asan.h>
 #include <sys/bitset.h>
 #include <sys/domainset.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/msan.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/smr.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_dumpset.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #include <ddb/ddb.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 
 #include <machine/md_var.h>
 
 #ifdef INVARIANTS
 #define	UMA_ALWAYS_CTORDTOR	1
 #else
 #define	UMA_ALWAYS_CTORDTOR	0
 #endif
 
 /*
  * This is the zone and keg from which all zones are spawned.
  */
 static uma_zone_t kegs;
 static uma_zone_t zones;
 
 /*
  * On INVARIANTS builds, the slab contains a second bitset of the same size,
  * "dbg_bits", which is laid out immediately after us_free.
  */
 #ifdef INVARIANTS
 #define	SLAB_BITSETS	2
 #else
 #define	SLAB_BITSETS	1
 #endif
 
 /*
  * These are the two zones from which all offpage uma_slab_ts are allocated.
  *
  * One zone is for slab headers that can represent a larger number of items,
  * making the slabs themselves more efficient, and the other zone is for
  * headers that are smaller and represent fewer items, making the headers more
  * efficient.
  */
 #define	SLABZONE_SIZE(setsize)					\
     (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
 #define	SLABZONE0_SETSIZE	(PAGE_SIZE / 16)
 #define	SLABZONE1_SETSIZE	SLAB_MAX_SETSIZE
 #define	SLABZONE0_SIZE	SLABZONE_SIZE(SLABZONE0_SETSIZE)
 #define	SLABZONE1_SIZE	SLABZONE_SIZE(SLABZONE1_SETSIZE)
 static uma_zone_t slabzones[2];
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
  * prior to malloc coming up.
  */
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
 int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");
 
 /*
  * Are we allowed to allocate buckets?
  */
 static int bucketdisable = 1;
 
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
 /* Linked list of all cache-only zones in the system */
 static LIST_HEAD(,uma_zone) uma_cachezones =
     LIST_HEAD_INITIALIZER(uma_cachezones);
 
 /*
  * Read/write lock for global lists: uma_kegs, uma_cachezones, and the
  * per-keg list of zones.
  */
 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 
 static struct sx uma_reclaim_lock;
 
 /*
  * First available virtual address for boot time allocations.
  */
 static vm_offset_t bootstart;
 static vm_offset_t bootmem;
 
 /*
  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
  * allocations don't trigger a wakeup of the reclaim thread.
  */
 unsigned long uma_kmem_limit = LONG_MAX;
 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
     "UMA kernel memory soft limit");
 unsigned long uma_kmem_total;
 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
     "UMA kernel memory usage");
 
 /* Is the VM done starting up? */
 /*
  * Boot progress of the allocator, starting at BOOT_COLD.  The enumerators
  * are declared in increasing order and are compared relationally; for
  * example, bucket_enable() asserts booted >= BOOT_KVA.
  */
 static enum {
 	BOOT_COLD,
 	BOOT_KVA,
 	BOOT_PCPU,
 	BOOT_RUNNING,
 	BOOT_SHUTDOWN,
 } booted = BOOT_COLD;
 
 /*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
 static struct callout uma_callout;
 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
 
 /*
  * This structure is passed as the zone ctor arg so that I don't have to create
  * a special allocation function just for zones.
  *
  * NOTE(review): the code that fills in and consumes these fields
  * (presumably zone_ctor(), prototyped further down) is outside this part
  * of the file; confirm per-field meanings there.
  */
 struct uma_zctor_args {
 	const char *name;
 	size_t size;
 	uma_ctor ctor;
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
 	uma_import import;
 	uma_release release;
 	void *arg;
 	uma_keg_t keg;
 	int align;
 	uint32_t flags;
 };
 
 /*
  * Argument bundle whose fields mirror the parameters of uma_kcreate()
  * (zone, size, uminit, fini, align, flags), as prototyped further down.
  *
  * NOTE(review): presumably the keg counterpart of struct uma_zctor_args,
  * handed to keg_ctor(); the consumer is outside this part of the file, so
  * confirm there.
  */
 struct uma_kctor_args {
 	uma_zone_t zone;
 	size_t size;
 	uma_init uminit;
 	uma_fini fini;
 	int align;
 	uint32_t flags;
 };
 
 /*
  * One entry of the bucket_zones[] table: a zone of buckets together with
  * its name, the number of items each of its buckets can hold, and the
  * largest per-item allocation size associated with it.
  */
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;
 	const char	*ubz_name;
 	int		ubz_entries;	/* Number of items it can hold. */
 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
 };
 
 /*
  * Compute the actual number of bucket entries to pack them in power
  * of two sizes for more efficient space utilization.
  *
  * BUCKET_SIZE(n) is the number of pointer-sized slots that remain after
  * sizeof(struct uma_bucket) is subtracted from n pointers' worth of bytes.
  */
 #define	BUCKET_SIZE(n)						\
     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 
 /* Entry count of the largest bucket zone in the table below. */
 #define	BUCKET_MAX	BUCKET_SIZE(256)
 
 /*
  * Table of bucket zones.  Each initializer is { ubz_zone, ubz_name,
  * ubz_entries, ubz_maxsize }; ubz_zone starts out NULL.  The final entry
  * has ubz_entries == 0, which bucket_init() uses as the end-of-table
  * marker.
  */
 struct uma_bucket_zone bucket_zones[] = {
 	/* Literal bucket sizes. */
 	{ NULL, "2 Bucket", 2, 4096 },
 	{ NULL, "4 Bucket", 4, 3072 },
 	{ NULL, "8 Bucket", 8, 2048 },
 	{ NULL, "16 Bucket", 16, 1024 },
 	/* Rounded down power of 2 sizes for efficiency. */
 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 	{ NULL, NULL, 0}
 };
 
 /*
  * Flags and enumerations to be passed to internal functions.
  *
  * enum zfreeskip is the "skip" argument type of item_dtor() and
  * zone_free_item() (see their prototypes).  SKIP_CNT occupies the low bit,
  * while SKIP_DTOR and SKIP_FINI are 0x10000 and 0x20000.
  * NOTE(review): whether these values are OR-ed together or compared by
  * magnitude is decided by code outside this part of the file.
  */
 enum zfreeskip {
 	SKIP_NONE =	0,
 	SKIP_CNT =	0x00000001,
 	SKIP_DTOR =	0x00010000,
 	SKIP_FINI =	0x00020000,
 };
 
 /* Prototypes.. */
 
 void	uma_startup1(vm_offset_t);
 void	uma_startup2(void);
 
 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
 static void pcpu_page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_reclaim(uma_zone_t zone, bool, int);
 static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static void keg_drain(uma_keg_t keg, int domain);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static inline void item_dtor(uma_zone_t zone, void *item, int size,
     void *udata, enum zfreeskip skip);
 static int zero_init(void *, int, int);
 static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
     int itemdomain, bool ws);
 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
 static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
 static void zone_timeout(uma_zone_t zone, void *);
 static int hash_alloc(struct uma_hash *, u_int);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_shutdown(void);
 static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
 static void zone_free_limit(uma_zone_t zone, int count);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(int domain);
 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
 static size_t slab_sizeof(int nitems);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
 static int zone_import(void *, void **, int, int, int);
 static void zone_release(void *, void **, int);
 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
 
 /* Sysctl handlers; each uses the common SYSCTL_HANDLER_ARGS signature. */
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
 
 static uint64_t uma_zone_get_allocs(uma_zone_t zone);
 
 /* Parent node (vm.debug) for the debugging knobs declared below. */
 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Memory allocation debugging");
 
 /* Everything up to the matching #endif exists only in INVARIANTS kernels. */
 #ifdef INVARIANTS
 static uint64_t uma_keg_get_allocs(uma_keg_t zone);
 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
 
 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 
 /*
  * Exported as vm.debug.divisor.  CTLFLAG_NOFETCH means the sysctl layer does
  * not load the tunable itself; presumably it is fetched elsewhere in this
  * file -- confirm against the startup code.
  */
 static u_int dbg_divisor = 1;
 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
     "Debug & thrash every this item in memory allocator");
 
 /* Counters exported read-only as vm.debug.trashed and vm.debug.skipped. */
 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
     &uma_dbg_cnt, "memory items debugged");
 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
     &uma_skip_cnt, "memory items skipped, not debugged");
 #endif
 
 /* Parent node (vm.uma). */
 SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Universal Memory Allocator");
 
 /* vm.zone_count: served by sysctl_vm_zone_count(). */
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 
 /* vm.zone_stats: served by sysctl_vm_zone_stats(). */
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 /* When zero, zone_log_warning() prints nothing. */
 static int zone_warnings = 1;
 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /* Loader tunable vm.debug.uma_multipage_slabs, mirrored as a sysctl. */
 static int multipage_slabs = 1;
 TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs);
 SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0,
     "UMA may choose larger slab sizes for better efficiency");
 
 /*
  * Return the zone that backs offpage slab headers for a keg holding at most
  * "ipers" items per slab: slabzones[1] when ipers exceeds SLABZONE0_SETSIZE,
  * slabzones[0] otherwise.
  */
 static inline uma_zone_t
 slabzone(int ipers)
 {
 	int which;
 
 	which = (ipers > SLABZONE0_SETSIZE) ? 1 : 0;
 	return (slabzones[which]);
 }
 
 /*
  * This routine checks to see whether or not it's safe to enable buckets.
  *
  * It stores the result of vm_page_count_min() in the global "bucketdisable"
  * and asserts that boot has progressed to at least BOOT_KVA.
  */
 static void
 bucket_enable(void)
 {
 
 	KASSERT(booted >= BOOT_KVA, ("Bucket enable before init"));
 	bucketdisable = vm_page_count_min();
 }
 
 /*
  * Create one UMA zone per entry of the bucket_zones table (the table ends at
  * the first entry whose ubz_entries is zero).
  *
  * A bucket consists of the uma_bucket header, padded to pointer alignment,
  * followed by ubz_entries item pointers.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
 	int hdrsz, size;
 
 	/* The header size is the same for every bucket zone. */
 	hdrsz = roundup(sizeof(struct uma_bucket), sizeof(void *));
 
 	ubz = bucket_zones;
 	while (ubz->ubz_entries != 0) {
 		size = hdrsz + sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
 		    UMA_ZONE_FIRSTTOUCH);
 		ubz++;
 	}
 }
 
 /*
  * Map a desired bucket capacity to a bucket zone: the first table entry able
  * to hold at least "entries" pointers, or the last (largest) entry when no
  * zone is big enough.
  */
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {
 	struct uma_bucket_zone *cur;
 
 	cur = bucket_zones;
 	while (cur->ubz_entries != 0) {
 		if (cur->ubz_entries >= entries)
 			return (cur);
 		cur++;
 	}
 
 	/* Ran into the terminator; step back to the final real entry. */
 	return (cur - 1);
 }
 
 /*
  * Choose a bucket capacity (number of entries) for items of "size" bytes.
  *
  * Items larger than the first zone's ubz_maxsize get a capacity scaled down
  * in proportion, never below one.  Otherwise the table is walked until an
  * entry whose ubz_maxsize is smaller than "size" (or the terminator) is
  * reached, and the entry before it supplies the capacity.
  */
 static int
 bucket_select(int size)
 {
 	struct uma_bucket_zone *ubz;
 
 	ubz = &bucket_zones[0];
 	if (size <= ubz->ubz_maxsize) {
 		while (ubz->ubz_entries != 0 && ubz->ubz_maxsize >= size)
 			ubz++;
 		return ((ubz - 1)->ubz_entries);
 	}
 
 	return (MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1));
 }
 
 /*
  * Allocate an empty bucket for "zone".
  *
  * Arguments:
  *	zone   The zone the bucket will serve.
  *	udata  Recursion cookie; only meaningful when "zone" is itself a
  *	       bucket zone (see below).
  *	flags  Allocation flags forwarded to uma_zalloc_arg().
  *
  * Returns:
  *	An initialized, empty bucket, or NULL if none could be allocated.
  */
 static uma_bucket_t
 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 {
 	struct uma_bucket_zone *ubz;
 	uma_bucket_t bucket;
 	uintptr_t cookie;
 
 	/* No buckets until boot has reached BOOT_KVA. */
 	if (__predict_false(booted < BOOT_KVA))
 		return (NULL);
 
 	/*
 	 * The cookie handed to zalloc_arg/zfree_arg carries the flags of the
 	 * zone that started the request, so that NOVM survives arbitrarily
 	 * deep recursion.  When the request comes from a bucket zone we tag
 	 * the cookie with ZFLAG_BUCKET; seeing the tag again means we are
 	 * already allocating a bucket for a bucket zone, and we refuse in
 	 * order to bound the recursion.  The same cookie accompanies frees
 	 * of unused buckets on the allocation path and bucket allocations
 	 * made on the free path.
 	 */
 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) != 0) {
 		cookie = (uintptr_t)udata;
 		if ((cookie & UMA_ZFLAG_BUCKET) != 0)
 			return (NULL);
 		cookie |= UMA_ZFLAG_BUCKET;
 	} else
 		cookie = (uintptr_t)zone->uz_flags;
 	if ((cookie & UMA_ZONE_VM) != 0)
 		flags |= M_NOVM;
 	udata = (void *)cookie;
 
 	/* Never take a bucket zone's bucket from that same zone. */
 	ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size));
 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 		ubz++;
 
 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 	if (bucket == NULL)
 		return (NULL);
 
 #ifdef INVARIANTS
 	bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 #endif
 	bucket->ub_cnt = 0;
 	bucket->ub_entries = min(ubz->ubz_entries,
 	    zone->uz_bucket_size_max);
 	bucket->ub_seq = SMR_SEQ_INVALID;
 	CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
 	    zone->uz_name, zone, bucket);
 
 	return (bucket);
 }
 
 /*
  * Release a bucket back to the bucket zone it was sized for, first draining
  * any items it still holds into "zone".
  *
  * Arguments:
  *	zone    The zone the bucket served.
  *	bucket  The bucket to free.
  *	udata   Recursion cookie; replaced by the zone's flags unless "zone"
  *	        is itself a bucket zone.
  */
 static void
 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 {
 	struct uma_bucket_zone *home;
 
 	/* Hand leftover items back to the zone before the bucket goes away. */
 	if (bucket->ub_cnt != 0)
 		bucket_drain(zone, bucket);
 
 	KASSERT(bucket->ub_cnt == 0,
 	    ("bucket_free: Freeing a non free bucket."));
 	KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
 	    ("bucket_free: Freeing an SMR bucket."));
 
 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 		udata = (void *)(uintptr_t)zone->uz_flags;
 
 	home = bucket_zone_lookup(bucket->ub_entries);
 	uma_zfree_arg(home->ubz_zone, bucket, udata);
 }
 
 /*
  * Ask every bucket zone to drain (UMA_RECLAIM_DRAIN) for the given domain.
  */
 static void
 bucket_zone_drain(int domain)
 {
 	struct uma_bucket_zone *cur;
 
 	cur = bucket_zones;
 	while (cur->ubz_entries != 0) {
 		uma_zone_reclaim_domain(cur->ubz_zone, UMA_RECLAIM_DRAIN,
 		    domain);
 		cur++;
 	}
 }
 
 #ifdef KASAN
 /* Compile-time check: UMA_SMALLEST_UNIT is a multiple of KASAN_SHADOW_SCALE. */
 _Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0,
     "Base UMA allocation size not a multiple of the KASAN scale factor");
 
 /*
  * Tell KASAN that an item is in use.  kasan_mark() is given the item size,
  * that size rounded up to KASAN_SHADOW_SCALE, and the generic redzone code.
  * Zones flagged UMA_ZONE_NOKASAN are left alone; for per-CPU zones every
  * CPU's copy (0..mp_maxid) is marked.
  */
 static void
 kasan_mark_item_valid(uma_zone_t zone, void *item)
 {
 	void *base;
 	size_t used, padded;
 	int cpu;
 
 	if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0)
 		return;
 
 	used = zone->uz_size;
 	padded = roundup2(used, KASAN_SHADOW_SCALE);
 	if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
 		kasan_mark(item, used, padded, KASAN_GENERIC_REDZONE);
 		return;
 	}
 
 	base = zpcpu_base_to_offset(item);
 	for (cpu = 0; cpu <= mp_maxid; cpu++)
 		kasan_mark(zpcpu_get_cpu(base, cpu), used, padded,
 		    KASAN_GENERIC_REDZONE);
 }
 
 /*
  * Tell KASAN that an item has been freed: the whole item, rounded up to
  * KASAN_SHADOW_SCALE, is marked with KASAN_UMA_FREED.  Zones flagged
  * UMA_ZONE_NOKASAN are left alone; for per-CPU zones every CPU's copy
  * (0..mp_maxid) is marked.
  */
 static void
 kasan_mark_item_invalid(uma_zone_t zone, void *item)
 {
 	void *base;
 	size_t padded;
 	int cpu;
 
 	if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0)
 		return;
 
 	padded = roundup2(zone->uz_size, KASAN_SHADOW_SCALE);
 	if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
 		kasan_mark(item, 0, padded, KASAN_UMA_FREED);
 		return;
 	}
 
 	base = zpcpu_base_to_offset(item);
 	for (cpu = 0; cpu <= mp_maxid; cpu++)
 		kasan_mark(zpcpu_get_cpu(base, cpu), 0, padded,
 		    KASAN_UMA_FREED);
 }
 
 /*
  * Mark all uk_ppera pages of a slab as valid for KASAN, unless the keg is
  * flagged UMA_ZONE_NOKASAN.
  */
 static void
 kasan_mark_slab_valid(uma_keg_t keg, void *mem)
 {
 	size_t slabsz;
 
 	if ((keg->uk_flags & UMA_ZONE_NOKASAN) != 0)
 		return;
 
 	slabsz = keg->uk_ppera * PAGE_SIZE;
 	kasan_mark(mem, slabsz, slabsz, 0);
 }
 
 /*
  * Mark a slab's memory as freed (KASAN_UMA_FREED), unless the keg is flagged
  * UMA_ZONE_NOKASAN.  Offpage kegs have the entire slab marked; otherwise
  * only the first uk_pgoff bytes are marked.
  */
 static void
 kasan_mark_slab_invalid(uma_keg_t keg, void *mem)
 {
 	size_t span;
 
 	if ((keg->uk_flags & UMA_ZONE_NOKASAN) != 0)
 		return;
 
 	if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) == 0)
 		span = keg->uk_pgoff;
 	else
 		span = keg->uk_ppera * PAGE_SIZE;
 	kasan_mark(mem, 0, span, KASAN_UMA_FREED);
 }
 #else /* !KASAN */
 /* No-op stand-in used when the kernel is built without KASAN. */
 static void
 kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused)
 {
 }
 
 /* No-op stand-in used when the kernel is built without KASAN. */
 static void
 kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused)
 {
 }
 
 /* No-op stand-in used when the kernel is built without KASAN. */
 static void
 kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused)
 {
 }
 
 /* No-op stand-in used when the kernel is built without KASAN. */
 static void
 kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused)
 {
 }
 #endif /* KASAN */
 
 #ifdef KMSAN
 /*
  * Record a KMSAN origin for an item and update its shadow state.  Zones that
  * UMA cannot instrument correctly are skipped entirely.
  */
 static inline void
 kmsan_mark_item_uninitialized(uma_zone_t zone, void *item)
 {
 	void *base;
 	size_t len;
 	int cpu;
 
 	/*
 	 * Skipped zone classes:
 	 *
 	 * - Cache zones: UMA lacks the information to instrument them
 	 *   correctly, so consumers mark items themselves where it makes
 	 *   sense.
 	 * - Secondary zones: their items are initialized by the parent
 	 *   zone and so cannot safely be marked here.
 	 * - malloc zones: malloc(9) and friends handle these directly and
 	 *   can track origins more precisely.
 	 */
 	if ((zone->uz_flags &
 	    (UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0)
 		return;
 
 	/*
 	 * Items of a keg with an init routine are initialized by
 	 * definition and cannot be marked; the best that can be done for
 	 * such zones is to mark items after they are freed to the keg.
 	 */
 	if (zone->uz_keg->uk_init != NULL)
 		return;
 
 	len = zone->uz_size;
 	if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
 		kmsan_orig(item, len, KMSAN_TYPE_UMA, KMSAN_RET_ADDR);
 		kmsan_mark(item, len, KMSAN_STATE_UNINIT);
 		return;
 	}
 
 	/*
 	 * NOTE(review): the per-CPU copies are marked KMSAN_STATE_INITED,
 	 * unlike the KMSAN_STATE_UNINIT used above -- confirm this
 	 * asymmetry is intentional.
 	 */
 	base = zpcpu_base_to_offset(item);
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		kmsan_orig(zpcpu_get_cpu(base, cpu), len,
 		    KMSAN_TYPE_UMA, KMSAN_RET_ADDR);
 		kmsan_mark(zpcpu_get_cpu(base, cpu), len,
 		    KMSAN_STATE_INITED);
 	}
 }
 #else /* !KMSAN */
 /* No-op stand-in used when the kernel is built without KMSAN. */
 static inline void
 kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused)
 {
 }
 #endif /* KMSAN */
 
 /*
  * Lock the zone's per-domain bucket cache and return it.
  *
  * If the lock already had an owner when we looked, treat that as contention
  * and grow the zone's bucket size by one, up to uz_bucket_size_max.
  */
 static uma_zone_domain_t
 zone_domain_lock(uma_zone_t zone, int domain)
 {
 	uma_zone_domain_t zdom;
 	bool contended;
 
 	zdom = ZDOM_GET(zone, domain);
 	contended = ZDOM_OWNED(zdom) ? true : false;
 	ZDOM_LOCK(zdom);
 
 	/* Unsynchronized on purpose; the size need not be exact. */
 	if (contended && zone->uz_bucket_size < zone->uz_bucket_size_max)
 		zone->uz_bucket_size++;
 
 	return (zdom);
 }
 
 /*
  * Search for the domain with the least cached items and return it if it
  * is out of balance with the preferred domain.
  *
  * Arguments:
  *	zone  The zone whose per-domain bucket caches are examined.
  *	pref  The caller's preferred domain.
  *
  * Returns:
  *	"pref" when its cached item count is less than twice the smallest
  *	count found, otherwise the index of the domain with the smallest
  *	count (the lowest index wins ties).
  */
 static __noinline int
 zone_domain_lowest(uma_zone_t zone, int pref)
 {
 	long least, nitems, prefitems;
 	int domain;
 	int i;
 
 	prefitems = least = LONG_MAX;
 	domain = 0;
 	for (i = 0; i < vm_ndomains; i++) {
 		nitems = ZDOM_GET(zone, i)->uzd_nitems;
 		if (nitems < least) {
 			domain = i;
 			least = nitems;
 		}
 		/*
 		 * Remember the preferred domain's own count.  This must key
 		 * on the loop index: comparing the running-minimum domain
 		 * against "pref" would leave prefitems at LONG_MAX whenever
 		 * an earlier domain holds fewer items, or overwrite it with
 		 * another domain's count.
 		 */
 		if (i == pref)
 			prefitems = nitems;
 	}
 	if (prefitems < least * 2)
 		return (pref);
 
 	return (domain);
 }
 
 /*
  * Return the preferred domain if it caches more than BUCKET_MAX items;
  * otherwise return the domain caching the most items (domain 0 when every
  * cache is empty; the lowest index wins ties).
  */
 static __noinline int
 zone_domain_highest(uma_zone_t zone, int pref)
 {
 	long best, cur;
 	int winner;
 	int d;
 
 	if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX)
 		return (pref);
 
 	winner = 0;
 	best = 0;
 	for (d = 0; d < vm_ndomains; d++) {
 		cur = ZDOM_GET(zone, d)->uzd_nitems;
 		if (cur <= best)
 			continue;
 		best = cur;
 		winner = d;
 	}
 
 	return (winner);
 }
 
 /*
  * Raise uzd_imax to "nitems" if that is a new maximum; do nothing when the
  * current value is already at least as large.
  */
 static void
 zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
 {
 	long prev, batch;
 
 	prev = zdom->uzd_imax;
 	for (;;) {
 		if (prev >= nitems)
 			return;
 		/* On failure fcmpset reloads "prev" with the current value. */
 		if (atomic_fcmpset_long(&zdom->uzd_imax, &prev, nitems) != 0)
 			break;
 	}
 
 	/*
 	 * A new maximum was installed.  Fold the batch measured against the
 	 * old bimin into the WSS one last time, then start measuring the
 	 * next allocation batch from here.
 	 */
 	batch = prev - zdom->uzd_bimin;
 	if (zdom->uzd_wss < batch)
 		zdom->uzd_wss = batch;
 	zdom->uzd_bimin = nitems;
 }
 
 /*
  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
  * zone's caches.
  *
  * Arguments:
  *	zone     The zone the domain cache belongs to.
  *	zdom     The per-domain bucket cache; its lock must be held on entry.
  *	reclaim  True when the bucket is being taken for reclamation rather
  *	         than for an allocation; selects how the WSS bounds move.
  *
  * Returns:
  *	The bucket at the head of the cache, with the domain lock dropped, or
  *	NULL with the domain lock still held when the cache is empty or the
  *	head bucket's SMR sequence has not yet expired.
  */
 static uma_bucket_t
 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 {
 	uma_bucket_t bucket;
 	long cnt;
 	int i;
 	bool dtor = false;
 
 	ZDOM_LOCK_ASSERT(zdom);
 
 	if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
 		return (NULL);
 
 	/* SMR Buckets can not be re-used until readers expire. */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
 	    bucket->ub_seq != SMR_SEQ_INVALID) {
 		if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
 			return (NULL);
 		bucket->ub_seq = SMR_SEQ_INVALID;
 		/* Destructors for these items run below, after unlocking. */
 		dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR;
 		/* Publish the next bucket's sequence as the cache's head. */
 		if (STAILQ_NEXT(bucket, ub_link) != NULL)
 			zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq;
 	}
 	STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link);
 
 	KASSERT(zdom->uzd_nitems >= bucket->ub_cnt,
 	    ("%s: item count underflow (%ld, %d)",
 	    __func__, zdom->uzd_nitems, bucket->ub_cnt));
 	KASSERT(bucket->ub_cnt > 0,
 	    ("%s: empty bucket in bucket cache", __func__));
 	zdom->uzd_nitems -= bucket->ub_cnt;
 
 	if (reclaim) {
 		/*
 		 * Shift the bounds of the current WSS interval to avoid
 		 * perturbing the estimates.
 		 */
 		cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt);
 		atomic_subtract_long(&zdom->uzd_imax, cnt);
 		zdom->uzd_bimin -= cnt;
 		zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
 		if (zdom->uzd_limin >= bucket->ub_cnt) {
 			zdom->uzd_limin -= bucket->ub_cnt;
 		} else {
 			/* limin would go negative: clamp it and restart timin. */
 			zdom->uzd_limin = 0;
 			zdom->uzd_timin = 0;
 		}
 	} else if (zdom->uzd_bimin > zdom->uzd_nitems) {
 		/* An allocation set a new low; lower the minimums to match. */
 		zdom->uzd_bimin = zdom->uzd_nitems;
 		if (zdom->uzd_imin > zdom->uzd_nitems)
 			zdom->uzd_imin = zdom->uzd_nitems;
 	}
 
 	ZDOM_UNLOCK(zdom);
 	if (dtor)
 		for (i = 0; i < bucket->ub_cnt; i++)
 			item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
 			    NULL, SKIP_NONE);
 
 	return (bucket);
 }
 
 /*
  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
  * whether the bucket's contents should be counted as part of the zone's working
  * set.  The bucket may be freed if it exceeds the bucket limit.
  *
  * Arguments:
  *	zone    The zone whose bucket cache receives the bucket.
  *	domain  Index of the per-domain cache to insert into.
  *	bucket  The bucket to cache; empty buckets are freed instead.
  *	udata   Passed through to bucket_free() when the bucket is freed.
  *	ws      See above.
  *
  * The domain lock is taken and released internally.
  */
 static void
 zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
     const bool ws)
 {
 	uma_zone_domain_t zdom;
 
 	/* We don't cache empty buckets.  This can happen after a reclaim. */
 	if (bucket->ub_cnt == 0)
 		goto out;
 	zdom = zone_domain_lock(zone, domain);
 
 	/*
 	 * Conditionally set the maximum number of items.
 	 */
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
 		if (ws) {
 			zone_domain_imax_set(zdom, zdom->uzd_nitems);
 		} else {
 			/*
 			 * Shift the bounds of the current WSS interval to
 			 * avoid perturbing the estimates.
 			 */
 			atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt);
 			zdom->uzd_imin += bucket->ub_cnt;
 			zdom->uzd_bimin += bucket->ub_cnt;
 			zdom->uzd_limin += bucket->ub_cnt;
 		}
 		/* The first bucket in an empty cache defines the head sequence. */
 		if (STAILQ_EMPTY(&zdom->uzd_buckets))
 			zdom->uzd_seq = bucket->ub_seq;
 
 		/*
 		 * Try to promote reuse of recently used items.  For items
 		 * protected by SMR, try to defer reuse to minimize polling.
 		 */
 		if (bucket->ub_seq == SMR_SEQ_INVALID)
 			STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 		else
 			STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
 		ZDOM_UNLOCK(zdom);
 		return;
 	}
 	/* Over the limit: undo the accounting and free the bucket instead. */
 	zdom->uzd_nitems -= bucket->ub_cnt;
 	ZDOM_UNLOCK(zdom);
 out:
 	bucket_free(zone, bucket, udata);
 }
 
 /*
  * Remove and return the most recently pushed item of a per-cpu cache bucket,
  * and count the allocation in the cache.  Must run in a critical section.
  */
 static inline void *
 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket)
 {
 	uma_bucket_t ub;
 	void *item;
 
 	CRITICAL_ASSERT(curthread);
 
 	ub = bucket->ucb_bucket;
 	bucket->ucb_cnt--;
 	item = ub->ub_bucket[bucket->ucb_cnt];
 #ifdef INVARIANTS
 	ub->ub_bucket[bucket->ucb_cnt] = NULL;
 	KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
 #endif
 	cache->uc_allocs++;
 
 	return (item);
 }
 
 /*
  * Append an item to a per-cpu cache bucket and count the free in the cache.
  * Must run in a critical section.
  */
 static inline void
 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item)
 {
 
 	CRITICAL_ASSERT(curthread);
 	KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL,
 	    ("uma_zfree: Freeing to non free bucket index."));
 
 	bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt++] = item;
 	cache->uc_frees++;
 }
 
 /*
  * Detach the UMA bucket from a per-cpu cache slot.
  *
  * The slot's item count is written back into the bucket and the slot is
  * cleared.  Returns the detached bucket, or NULL if the slot was empty.
  */
 static inline uma_bucket_t
 cache_bucket_unload(uma_cache_bucket_t bucket)
 {
 	uma_bucket_t ub;
 
 	ub = bucket->ucb_bucket;
 	if (ub == NULL)
 		return (NULL);
 
 	MPASS(ub->ub_entries == bucket->ucb_entries);
 	ub->ub_cnt = bucket->ucb_cnt;
 	bucket->ucb_bucket = NULL;
 	bucket->ucb_cnt = 0;
 	bucket->ucb_entries = 0;
 
 	return (ub);
 }
 
 /* Detach and return the bucket in the cache's uc_allocbucket slot, if any. */
 static inline uma_bucket_t
 cache_bucket_unload_alloc(uma_cache_t cache)
 {
 
 	return (cache_bucket_unload(&cache->uc_allocbucket));
 }
 
 /* Detach and return the bucket in the cache's uc_freebucket slot, if any. */
 static inline uma_bucket_t
 cache_bucket_unload_free(uma_cache_t cache)
 {
 
 	return (cache_bucket_unload(&cache->uc_freebucket));
 }
 
 /* Detach and return the bucket in the cache's uc_crossbucket slot, if any. */
 static inline uma_bucket_t
 cache_bucket_unload_cross(uma_cache_t cache)
 {
 
 	return (cache_bucket_unload(&cache->uc_crossbucket));
 }
 
 /*
  * Attach UMA bucket "b" to an empty per-cpu cache slot, copying the bucket's
  * item count and capacity into the slot.  Must run in a critical section;
  * the bucket must not carry a pending SMR sequence.
  */
 static inline void
 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b)
 {
 
 	CRITICAL_ASSERT(curthread);
 	MPASS(bucket->ucb_bucket == NULL);
 	MPASS(b->ub_seq == SMR_SEQ_INVALID);
 
 	bucket->ucb_entries = b->ub_entries;
 	bucket->ucb_cnt = b->ub_cnt;
 	bucket->ucb_bucket = b;
 }
 
 /* Attach bucket "b" to the cache's uc_allocbucket slot. */
 static inline void
 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b)
 {
 
 	cache_bucket_load(&cache->uc_allocbucket, b);
 }
 
 /* Attach bucket "b" to the cache's uc_freebucket slot. */
 static inline void
 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b)
 {
 
 	cache_bucket_load(&cache->uc_freebucket, b);
 }
 
 #ifdef NUMA
 /* Attach bucket "b" to the cache's uc_crossbucket slot (NUMA kernels only). */
 static inline void 
 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b)
 {
 
 	cache_bucket_load(&cache->uc_crossbucket, b);
 }
 #endif
 
 /*
  * Copy the bucket pointer, item count and capacity from b2 into b1.  Any
  * other field of the destination (ucb_spare) is left untouched.
  */
 static inline void
 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 {
 
 	b1->ucb_cnt = b2->ucb_cnt;
 	b1->ucb_entries = b2->ucb_entries;
 	b1->ucb_bucket = b2->ucb_bucket;
 }
 
 /*
  * Exchange the contents of two per-cpu cache slots, using
  * cache_bucket_copy() so only the fields it copies are swapped.  Must run
  * in a critical section.
  */
 static inline void
 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 {
 	struct uma_cache_bucket saved;
 
 	CRITICAL_ASSERT(curthread);
 
 	cache_bucket_copy(&saved, b1);
 	cache_bucket_copy(b1, b2);
 	cache_bucket_copy(b2, &saved);
 }
 
 /*
  * Try to obtain a full bucket from the zone's per-domain cache on behalf of
  * the current cpu cache.
  *
  * Returns the bucket, or NULL if none is available.  The domain lock is not
  * held on return in either case.
  */
 static uma_bucket_t
 cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 
 	/*
 	 * Unlocked pre-checks: skip the lock when the cache looks empty or,
 	 * for SMR zones, when the head sequence has not expired yet.
 	 */
 	zdom = ZDOM_GET(zone, domain);
 	if (zdom->uzd_nitems == 0)
 		return (NULL);
 	if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 &&
 	    !smr_poll(zone->uz_smr, zdom->uzd_seq, false))
 		return (NULL);
 
 	/*
 	 * Look in the zone's bucket cache.  zone_fetch_bucket() drops the
 	 * lock itself when it hands back a bucket.
 	 */
 	zdom = zone_domain_lock(zone, domain);
 	bucket = zone_fetch_bucket(zone, zdom, false);
 	if (bucket == NULL) {
 		ZDOM_UNLOCK(zdom);
 	}
 
 	return (bucket);
 }
 
 /*
  * Print the zone's warning message, at most once per 300 seconds per zone.
  * Nothing is printed when zone_warnings is zero or the zone has no message.
  */
 static void
 zone_log_warning(uma_zone_t zone)
 {
 	static const struct timeval warninterval = { 300, 0 };
 
 	if (zone_warnings && zone->uz_warning != NULL &&
 	    ratecheck(&zone->uz_ratecheck, &warninterval))
 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
 /*
  * Queue the zone's uz_maxaction task on taskqueue_thread, if a task function
  * has been set.
  */
 static inline void
 zone_maxaction(uma_zone_t zone)
 {
 
 	if (zone->uz_maxaction.ta_func == NULL)
 		return;
 	taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 }
 
 /*
  * Routine called by timeout which is used to fire off some time interval
  * based calculations.  (stats, hash size, etc.)
  *
  * It refreshes the bucket enable state, runs zone_timeout() on every zone
  * and then re-arms itself to run again after UMA_TIMEOUT seconds.
  *
  * Arguments:
  *	unused   Unused
  *
  * Returns:
  *	Nothing
  */
 static void
 uma_timeout(void *unused)
 {
 	bucket_enable();
 	zone_foreach(zone_timeout, NULL);
 
 	/* Reschedule this event */
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 }
 
 /*
  * Close the current measurement period for a per-domain bucket cache and
  * update its working set size estimates.  The domain lock must be held.
  * The constants chosen here are somewhat arbitrary.
  */
 static void
 zone_domain_update_wss(uma_zone_domain_t zdom)
 {
 	long headroom;
 
 	ZDOM_LOCK_ASSERT(zdom);
 	MPASS(zdom->uzd_imax >= zdom->uzd_nitems);
 	MPASS(zdom->uzd_nitems >= zdom->uzd_bimin);
 	MPASS(zdom->uzd_bimin >= zdom->uzd_imin);
 
 	/*
 	 * WSS is a modified moving average of the biggest allocation batch
 	 * seen in each period, spanning a few minutes (UMA_TIMEOUT of 20s).
 	 */
 	zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4,
 	    zdom->uzd_imax - zdom->uzd_bimin);
 
 	/*
 	 * The long-time minimum item count combines the recent minimum,
 	 * reduced by WSS for safety, with a modified moving average over
 	 * the last several hours (UMA_TIMEOUT of 20s).  timin counts the
 	 * periods since limin last tried to go negative, i.e. since we were
 	 * dangerously close to, or actually ran out of, cached items.
 	 */
 	headroom = zdom->uzd_imin - zdom->uzd_wss;
 	if (headroom < 0) {
 		zdom->uzd_limin = 0;
 		zdom->uzd_timin = 0;
 	} else {
 		if (zdom->uzd_limin < headroom)
 			zdom->uzd_limin =
 			    (headroom + zdom->uzd_limin * 255) / 256;
 		else
 			zdom->uzd_limin = headroom;
 		zdom->uzd_timin++;
 	}
 
 	/* Keep half of imax to soften period edge effects on WSS. */
 	atomic_subtract_long(&zdom->uzd_imax,
 	    (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2);
 	zdom->uzd_bimin = zdom->uzd_nitems;
 	zdom->uzd_imin = zdom->uzd_bimin;
 }
 
 /*
  * Routine to perform timeout driven calculations for one zone.  For zones
  * flagged UMA_ZFLAG_HASH this expands the keg's hash table when it has
  * become too small; for every zone it then trims the per-domain bucket
  * caches.
  *
  * Arguments:
  *	zone    The zone to process.
  *	unused  Unused.
  *
  *  Returns nothing.
  */
 static void
 zone_timeout(uma_zone_t zone, void *unused)
 {
 	uma_keg_t keg;
 	u_int slabs, pages;
 
 	if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
 		goto trim;
 
 	keg = zone->uz_keg;
 
 	/*
 	 * Hash zones are non-numa by definition so the first domain
 	 * is the only one present.
 	 */
 	KEG_LOCK(keg, 0);
 	pages = keg->uk_domain[0].ud_pages;
 
 	/*
 	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
 
 		/*
 		 * This is so involved because allocating and freeing
 		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		KEG_UNLOCK(keg, 0);
 		ret = hash_alloc(&newhash, 1 << fls(slabs));
 		KEG_LOCK(keg, 0);
 		if (ret) {
 			/*
 			 * On success the old table is retired; if the
 			 * expansion was refused the new table is the one
 			 * to discard.  Either way "oldhash" names the
 			 * table freed below, after the lock is dropped.
 			 */
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
 				keg->uk_hash = newhash;
 			} else
 				oldhash = newhash;
 
 			KEG_UNLOCK(keg, 0);
 			hash_free(&oldhash);
 			goto trim;
 		}
 	}
 	KEG_UNLOCK(keg, 0);
 
 trim:
 	/* Trim caches not used for a long time. */
 	for (int i = 0; i < vm_ndomains; i++) {
 		/* Drain the keg too, but only for zones that are not cache zones. */
 		if (bucket_cache_reclaim_domain(zone, false, false, i) &&
 		    (zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
 			keg_drain(zone->uz_keg, i);
 	}
 }
 
 /*
  * Allocate and zero fill a hash table from the appropriate backing store.
  *
  * Tables of at most UMA_HASH_SIZE_INIT entries come from hashzone and always
  * have exactly UMA_HASH_SIZE_INIT entries; larger tables are allocated with
  * malloc(9) using M_NOWAIT.
  *
  * Arguments:
  *	hash  The hash structure to fill in (table, size and mask).
  *	size  Requested number of entries; must be a power of 2.
  *
  * Returns:
  *	1 on success and 0 on failure.
  */
 static int
 hash_alloc(struct uma_hash *hash, u_int size)
 {
 	size_t nbytes;
 
 	KASSERT(powerof2(size), ("hash size must be power of 2"));
 	if (size <= UMA_HASH_SIZE_INIT) {
 		nbytes = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	} else {
 		hash->uh_hashsize = size;
 		nbytes = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = malloc(nbytes, M_UMAHASH, M_NOWAIT);
 	}
 
 	if (hash->uh_slab_hash == NULL)
 		return (0);
 
 	bzero(hash->uh_slab_hash, nbytes);
 	hash->uh_hashmask = hash->uh_hashsize - 1;
 	return (1);
 }
 
 /*
  * Expands the hash table for HASH zones.  This is done from zone_timeout
  * to reduce collisions.  This must not be done in the regular allocation
  * path, otherwise, we can recurse on the vm while allocating pages.
  *
  * Arguments:
  *	oldhash  The hash you want to expand
  *	newhash  The hash structure for the new table
  *
  * Returns:
  *	Nothing
  *
  * Discussion:
  */
 static int
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_hash_slab_t slab;
 	u_int hval;
 	u_int idx;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
 
 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 		return (0);
 
 	/*
 	 * I need to investigate hash algorithms for resizing without a
 	 * full rehash.
 	 */
 
 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
 		while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
 			slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
 			LIST_REMOVE(slab, uhs_hlink);
 			hval = UMA_HASH(newhash, slab->uhs_data);
 			LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, uhs_hlink);
 		}
 
 	return (1);
 }
 
 /*
  * Free a hash table to the backing store it was allocated from: hashzone
  * for tables of UMA_HASH_SIZE_INIT entries, malloc(9) otherwise.
  *
  * Arguments:
  *	hash  The hash structure whose table is freed; a NULL table is
  *	      ignored.
  *
  * Returns:
  *	Nothing
  */
 static void
 hash_free(struct uma_hash *hash)
 {
 	if (hash->uh_slab_hash == NULL)
 		return;
 
 	if (hash->uh_hashsize != UMA_HASH_SIZE_INIT) {
 		free(hash->uh_slab_hash, M_UMAHASH);
 		return;
 	}
 	zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 }
 
 /*
  * Returns every item held in a bucket to the zone, leaving the bucket empty.
  *
  * Arguments:
  *	zone   The zone to free to, must be unlocked.
  *	bucket The free/alloc bucket with items.
  *
  * Returns:
  *	Nothing
  */
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
 	void **items;
 	int i;
 
 	if (bucket->ub_cnt == 0)
 		return;
 	items = bucket->ub_bucket;
 
 	/*
 	 * A bucket with a pending SMR sequence must wait for it to expire;
 	 * only then are the item destructors run.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
 	    bucket->ub_seq != SMR_SEQ_INVALID) {
 		smr_wait(zone->uz_smr, bucket->ub_seq);
 		bucket->ub_seq = SMR_SEQ_INVALID;
 		for (i = 0; i < bucket->ub_cnt; i++)
 			item_dtor(zone, items[i], zone->uz_size, NULL,
 			    SKIP_NONE);
 	}
 
 	/* Run the zone's fini on each item, making it KASAN-valid first. */
 	if (zone->uz_fini != NULL) {
 		for (i = 0; i < bucket->ub_cnt; i++) {
 			kasan_mark_item_valid(zone, items[i]);
 			zone->uz_fini(items[i], zone->uz_size);
 			kasan_mark_item_invalid(zone, items[i]);
 		}
 	}
 
 	zone->uz_release(zone->uz_arg, items, bucket->ub_cnt);
 	if (zone->uz_max_items > 0)
 		zone_free_limit(zone, bucket->ub_cnt);
 #ifdef INVARIANTS
 	bzero(items, sizeof(void *) * bucket->ub_cnt);
 #endif
 	bucket->ub_cnt = 0;
 }
 
 /*
  * Drains the per cpu caches for a zone, then drains its bucket cache.
  *
  * NOTE: This may only be called while the zone is being torn down, and not
  * during normal operation.  This is necessary in order that we do not have
  * to migrate CPUs to drain the per-CPU caches.
  *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
  * Returns:
  *	Nothing
  */
 static void
 cache_drain(uma_zone_t zone)
 {
 	uma_cache_t pcpu;
 	uma_bucket_t b;
 	smr_seq_t seq;
 	int cpu;
 
 	/*
 	 * XXX: The per-CPU caches are deliberately accessed without
 	 * locking: the zone is being torn down, so nothing else will use
 	 * them from here on.
 	 *
 	 * XXX: It would be good to be able to assert that the zone is
 	 * being torn down to prevent improper use of cache_drain().
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 		seq = smr_advance(zone->uz_smr);
 	else
 		seq = SMR_SEQ_INVALID;
 
 	CPU_FOREACH(cpu) {
 		pcpu = &zone->uz_cpu[cpu];
 
 		b = cache_bucket_unload_alloc(pcpu);
 		if (b != NULL)
 			bucket_free(zone, b, NULL);
 
 		/* Free and cross buckets are tagged with "seq" first. */
 		b = cache_bucket_unload_free(pcpu);
 		if (b != NULL) {
 			b->ub_seq = seq;
 			bucket_free(zone, b, NULL);
 		}
 
 		b = cache_bucket_unload_cross(pcpu);
 		if (b != NULL) {
 			b->ub_seq = seq;
 			bucket_free(zone, b, NULL);
 		}
 	}
 	bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN);
 }
 
 /*
  * Move a zone's bucket size halfway towards its minimum.  Internal zones
  * are left untouched.  Suitable as a zone_foreach() callback; the second
  * argument is ignored.
  */
 static void
 cache_shrink(uma_zone_t zone, void *unused)
 {
 
 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) != 0)
 		return;
 
 	ZONE_LOCK(zone);
 	zone->uz_bucket_size =
 	    (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Unload the current CPU's cache buckets for a zone and hand them to
  * zone_free_bucket().  Internal zones are skipped.  Suitable as a
  * zone_foreach() callback; the second argument is ignored.
  */
 static void
 cache_drain_safe_cpu(uma_zone_t zone, void *unused)
 {
 	uma_cache_t cache;
 	uma_bucket_t allocb, freeb, crossb;
 	int domain;
 
 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) != 0)
 		return;
 
 	freeb = NULL;
 	crossb = NULL;
 
 	/* Detach the buckets inside a critical section. */
 	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	domain = PCPU_GET(domain);
 	allocb = cache_bucket_unload_alloc(cache);
 
 	/*
 	 * SMR zones keep their free and cross buckets: flushing them would
 	 * leave the zone without a bucket and force every free to
 	 * synchronize().
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
 		freeb = cache_bucket_unload_free(cache);
 		crossb = cache_bucket_unload_cross(cache);
 	}
 	critical_exit();
 
 	if (allocb != NULL)
 		zone_free_bucket(zone, allocb, NULL, domain, false);
 	if (freeb != NULL)
 		zone_free_bucket(zone, freeb, NULL, domain, false);
 	if (crossb != NULL) {
 		/* Adjust the domain so it goes to zone_free_cross. */
 		domain = (domain + 1) % vm_ndomains;
 		zone_free_bucket(zone, crossb, NULL, domain, false);
 	}
 }
 
 /*
  * Safely drain the per-CPU caches of one zone, or of every zone when "zone"
  * is NULL.
  * This is an expensive call because it needs to bind to all CPUs
  * one by one and enter a critical section on each of them in order
  * to safely access their cache buckets.
  * The zone lock must not be held when calling this function.
  */
 static void
 pcpu_cache_drain_safe(uma_zone_t zone)
 {
 	int cpu;
 
 	/*
 	 * Polite bucket size shrinking was not enough, shrink aggressively.
 	 */
 	if (zone != NULL)
 		cache_shrink(zone, NULL);
 	else
 		zone_foreach(cache_shrink, NULL);
 
 	CPU_FOREACH(cpu) {
 		/* Bind to the target CPU so the drain runs there. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
 
 		if (zone != NULL)
 			cache_drain_safe_cpu(zone, NULL);
 		else
 			zone_foreach(cache_drain_safe_cpu, NULL);
 	}
 
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 }
 
 /*
  * Reclaim cached buckets from one domain of a zone.  All buckets are
  * reclaimed if the caller requested a drain; with "trim" the cache is cut
  * down to its estimated working set size; with neither, a fraction of the
  * long-unused items is released once the cache has gone unused long enough.
  *
  * Arguments:
  *	zone    The zone whose bucket cache is reclaimed.
  *	drain   Reclaim everything.
  *	trim    Reclaim items in excess of the estimated working set size.
  *	domain  Index of the per-domain cache to reclaim from.
  *
  * Returns:
  *	true if at least one bucket from the cache was freed.
  */
 static bool
 bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	long target;
 	bool done = false;
 
 	/*
 	 * The cross bucket is partially filled and not part of
 	 * the item count.  Reclaim it individually here.
 	 */
 	zdom = ZDOM_GET(zone, domain);
 	if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
 		ZONE_CROSS_LOCK(zone);
 		bucket = zdom->uzd_cross;
 		zdom->uzd_cross = NULL;
 		ZONE_CROSS_UNLOCK(zone);
 		if (bucket != NULL)
 			bucket_free(zone, bucket, NULL);
 	}
 
 	/*
 	 * If we were asked to drain the zone, we are done only once
 	 * this bucket cache is empty.  If trim, we reclaim items in
 	 * excess of the zone's estimated working set size.  Multiple
 	 * consecutive calls will shrink the WSS and so reclaim more.
 	 * If neither drain nor trim, then voluntarily reclaim 1/4
 	 * (to reduce first spike) of items not used for a long time.
 	 */
 	ZDOM_LOCK(zdom);
 	zone_domain_update_wss(zdom);
 	if (drain)
 		target = 0;
 	else if (trim)
 		target = zdom->uzd_wss;
 	else if (zdom->uzd_timin > 900 / UMA_TIMEOUT)
 		target = zdom->uzd_nitems - zdom->uzd_limin / 4;
 	else {
 		ZDOM_UNLOCK(zdom);
 		return (done);
 	}
 	/*
 	 * zone_fetch_bucket() drops the domain lock when it returns a bucket
 	 * and keeps it held when it returns NULL, hence the re-lock at the
 	 * bottom of the loop and the single unlock after it.
 	 */
 	while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL &&
 	    zdom->uzd_nitems >= target + bucket->ub_cnt) {
 		bucket = zone_fetch_bucket(zone, zdom, true);
 		if (bucket == NULL)
 			break;
 		bucket_free(zone, bucket, NULL);
 		done = true;
 		ZDOM_LOCK(zdom);
 	}
 	ZDOM_UNLOCK(zdom);
 	return (done);
 }
 
 /*
  * Reclaim a zone's cached buckets, trimming to the working set size or
  * draining completely when "drain" is set.  A specific domain is honoured
  * only for zones that are not round-robin; otherwise every domain is
  * processed.
  */
 static void
 bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain)
 {
 	int d;
 
 	/*
 	 * Step the zone bucket size down by one so that the per-CPU caches
 	 * don't grow too large.
 	 */
 	if (zone->uz_bucket_size > zone->uz_bucket_size_min)
 		zone->uz_bucket_size--;
 
 	if (domain == UMA_ANYDOMAIN ||
 	    (zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) {
 		for (d = 0; d < vm_ndomains; d++)
 			bucket_cache_reclaim_domain(zone, drain, true, d);
 		return;
 	}
 
 	bucket_cache_reclaim_domain(zone, drain, true, domain);
 }
 
 /*
  * Release one slab of a keg back to the keg's page-free routine.
  *
  * Arguments:
  *	keg    The keg the slab belongs to.
  *	slab   The slab to release.
  *	start  Number of leading items to run the keg's fini routine on;
  *	       items start - 1 down to 0 are finalized.
  *
  * Returns:
  *	Nothing
  */
 static void
 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 {
 	uint8_t *mem;
 	size_t size;
 	int i;
 	uint8_t flags;
 
 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
 
 	mem = slab_data(slab, keg);
 	size = PAGE_SIZE * keg->uk_ppera;
 
 	/* Mark the slab valid for KASAN before it is touched below. */
 	kasan_mark_slab_valid(keg, mem);
 	if (keg->uk_fini != NULL) {
 		/*
 		 * The loop body is the single uk_fini() call; under
 		 * INVARIANTS it is additionally guarded by the "if" below.
 		 */
 		for (i = start - 1; i > -1; i--)
 #ifdef INVARIANTS
 		/*
 		 * trash_fini implies that dtor was trash_dtor. trash_fini
 		 * would check that memory hasn't been modified since free,
 		 * which executed trash_dtor.
 		 * That's why we need to run uma_dbg_kskip() check here,
 		 * albeit we don't make skip check for other init/fini
 		 * invocations.
 		 */
 		if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
 		    keg->uk_fini != trash_fini)
 #endif
 			keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
 	}
 	/* Read the flags now; the slab header may be released below. */
 	flags = slab->us_flags;
 	if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
 		/* Offpage slab headers are freed back to their slab zone. */
 		zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab),
 		    NULL, SKIP_NONE);
 	}
 	keg->uk_freef(mem, size, flags);
 	uma_total_dec(size);
 }
 
 /*
  * Release the fully free slabs of one keg domain, holding back only as many
  * as are needed to cover the keg's reserve (uk_reserve).  The slabs are
  * unlinked and the domain counters adjusted under the keg lock; the slabs
  * themselves are freed after the lock has been dropped.
  *
  * Arguments:
  *	keg     The keg to drain
  *	domain  Index into keg->uk_domain[] of the domain to drain
  */
 static void
 keg_drain_domain(uma_keg_t keg, int domain)
 {
 	struct slabhead freeslabs;
 	uma_domain_t dom;
 	uma_slab_t slab, tmp;
 	uint32_t i, stofree, stokeep, partial;
 
 	dom = &keg->uk_domain[domain];
 	LIST_INIT(&freeslabs);
 
 	CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u",
 	    keg->uk_name, keg, domain, dom->ud_free_items);
 
 	KEG_LOCK(keg, domain);
 
 	/*
 	 * Are the free items in partially allocated slabs sufficient to meet
 	 * the reserve? If not, compute the number of fully free slabs that must
 	 * be kept.
 	 */
 	partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers;
 	if (partial < keg->uk_reserve) {
 		stokeep = min(dom->ud_free_slabs,
 		    howmany(keg->uk_reserve - partial, keg->uk_ipers));
 	} else {
 		stokeep = 0;
 	}
 	stofree = dom->ud_free_slabs - stokeep;
 
 	/*
 	 * Partition the free slabs into two sets: those that must be kept in
 	 * order to maintain the reserve, and those that may be released back to
 	 * the system.  Since one set may be much larger than the other,
 	 * populate the smaller of the two sets and swap them if necessary.
 	 */
 	for (i = min(stofree, stokeep); i > 0; i--) {
 		slab = LIST_FIRST(&dom->ud_free_slab);
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&freeslabs, slab, us_link);
 	}
 	/*
 	 * If the loop above moved the slabs to keep (the smaller set), swap
 	 * the list heads so that freeslabs holds the slabs to release.
 	 */
 	if (stofree > stokeep)
 		LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link);
 
 	/* Slabs being released are dropped from the keg's hash table. */
 	if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) {
 		LIST_FOREACH(slab, &freeslabs, us_link)
 			UMA_HASH_REMOVE(&keg->uk_hash, slab);
 	}
 	/* Account for the departing slabs while the lock is still held. */
 	dom->ud_free_items -= stofree * keg->uk_ipers;
 	dom->ud_free_slabs -= stofree;
 	dom->ud_pages -= stofree * keg->uk_ppera;
 	KEG_UNLOCK(keg, domain);
 
 	/* Passing uk_ipers as 'start' covers every item in each slab. */
 	LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp)
 		keg_free_slab(keg, slab, keg->uk_ipers);
 }
 
 /*
  * Give a keg's unused slabs back to the system, either for one domain or,
  * when domain is UMA_ANYDOMAIN, for every domain.  This is done on demand
  * from the pageout daemon.  Kegs marked UMA_ZONE_NOFREE are left untouched.
  *
  * Returns nothing.
  */
 static void
 keg_drain(uma_keg_t keg, int domain)
 {
 	int d;
 
 	if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0)
 		return;
 
 	if (domain == UMA_ANYDOMAIN) {
 		for (d = 0; d < vm_ndomains; d++)
 			keg_drain_domain(keg, d);
 		return;
 	}
 	keg_drain_domain(keg, domain);
 }
 
 /*
  * Reclaim memory cached by a zone: reclaim its bucket caches and then, unless
  * the zone is a cache-only zone (UMA_ZFLAG_CACHE), drain the backing keg.
  *
  * Arguments:
  *	zone    The zone to reclaim from
  *	domain  The domain to reclaim from, or UMA_ANYDOMAIN
  *	waitok  M_WAITOK to sleep until no other reclaim of this zone is
  *	        active; any other value proceeds immediately
  *	drain   Passed through to bucket_cache_reclaim()
  */
 static void
 zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain)
 {
 	/*
 	 * Count active reclaim operations in order to interlock with
 	 * zone_dtor(), which removes the zone from global lists before
 	 * attempting to reclaim items itself.
 	 *
 	 * The zone may be destroyed while sleeping, so only zone_dtor() should
 	 * specify M_WAITOK.
 	 */
 	ZONE_LOCK(zone);
 	if (waitok == M_WAITOK) {
 		while (zone->uz_reclaimers > 0)
 			msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1);
 	}
 	zone->uz_reclaimers++;
 	ZONE_UNLOCK(zone);
 	/* The reclaim itself runs without the zone lock held. */
 	bucket_cache_reclaim(zone, drain, domain);
 
 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
 		keg_drain(zone->uz_keg, domain);
 	ZONE_LOCK(zone);
 	zone->uz_reclaimers--;
 	/* The last reclaimer out wakes any thread sleeping above. */
 	if (zone->uz_reclaimers == 0)
 		wakeup(zone);
 	ZONE_UNLOCK(zone);
 }
 
 /*
  * Callback form of a non-sleeping zone reclaim with drain set.  The domain
  * is carried in the opaque argument as an integer.
  */
 static void
 zone_drain(uma_zone_t zone, void *arg)
 {
 
 	zone_reclaim(zone, (int)(uintptr_t)arg, M_NOWAIT, true);
 }
 
 /*
  * Callback form of a non-sleeping zone reclaim with drain clear.  The domain
  * is carried in the opaque argument as an integer.
  */
 static void
 zone_trim(uma_zone_t zone, void *arg)
 {
 
 	zone_reclaim(zone, (int)(uintptr_t)arg, M_NOWAIT, false);
 }
 
 /*
  * Allocates a new slab for a keg and inserts it into the partial slab list.
  * The keg should be unlocked on entry.  If the allocation succeeds it will
  * be locked on return.
  *
  * Arguments:
  *	keg     The keg to allocate the slab for
  *	zone    Passed to the keg's page allocator and to vsetzoneslab()
  *	domain  The domain to allocate from; must be in [0, vm_ndomains)
  *	flags   Wait flags for the item initialization routine
  *	aflags  Wait flags for the slab allocation
  *
  * Returns:
  *	The slab that was allocated or NULL if there is no memory and the
  *	caller specified M_NOWAIT.  NULL is also returned when the keg's
  *	item init routine fails.
  */
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
     int aflags)
 {
 	uma_domain_t dom;
 	uma_slab_t slab;
 	unsigned long size;
 	uint8_t *mem;
 	uint8_t sflags;
 	int i;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_alloc_slab: domain %d out of range", domain));
 
 	slab = NULL;
 	mem = NULL;
 	/* An OFFPAGE keg gets its slab header from a separate slab zone. */
 	if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
 		uma_hash_slab_t hslab;
 		hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL,
 		    domain, aflags);
 		if (hslab == NULL)
 			goto fail;
 		slab = &hslab->uhs_slab;
 	}
 
 	/*
 	 * This reproduces the old vm_zone behavior of zero filling pages the
 	 * first time they are added to a zone.
 	 *
 	 * Malloced items are zeroed in uma_zalloc.
 	 */
 
 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 		aflags |= M_ZERO;
 	else
 		aflags &= ~M_ZERO;
 
 	if (keg->uk_flags & UMA_ZONE_NODUMP)
 		aflags |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
 	size = keg->uk_ppera * PAGE_SIZE;
 	mem = keg->uk_allocf(zone, size, domain, &sflags, aflags);
 	if (mem == NULL) {
 		/* Give back the off-page header allocated above. */
 		if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
 			zone_free_item(slabzone(keg->uk_ipers),
 			    slab_tohashslab(slab), NULL, SKIP_NONE);
 		goto fail;
 	}
 	uma_total_inc(size);
 
 	/* For HASH zones all pages go to the same uma_domain. */
 	if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
 		domain = 0;
 
 	/* Point the slab into the allocated memory */
 	if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE))
 		slab = (uma_slab_t)(mem + keg->uk_pgoff);
 	else
 		slab_tohashslab(slab)->uhs_data = mem;
 
 	/* Record the zone and slab for every page of a VTOSLAB keg. */
 	if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
 			    zone, slab);
 
 	slab->us_freecount = keg->uk_ipers;
 	slab->us_flags = sflags;
 	slab->us_domain = domain;
 
 	BIT_FILL(keg->uk_ipers, &slab->us_free);
 #ifdef INVARIANTS
 	BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
 #endif
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			if (keg->uk_init(slab_item(slab, keg, i),
 			    keg->uk_size, flags) != 0)
 				break;
 		/*
 		 * On an init failure, i is the index of the failing item, so
 		 * only the items before it are passed to the fini routine.
 		 */
 		if (i != keg->uk_ipers) {
 			keg_free_slab(keg, slab, i);
 			goto fail;
 		}
 	}
 	kasan_mark_slab_invalid(keg, mem);
 	/* Taken here and still held when the slab is returned. */
 	KEG_LOCK(keg, domain);
 
 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
 	    slab, keg->uk_name, keg);
 
 	if (keg->uk_flags & UMA_ZFLAG_HASH)
 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
 	/*
 	 * If we got a slab here it's safe to mark it partially used
 	 * and return.  We assume that the caller is going to remove
 	 * at least one item.
 	 */
 	dom = &keg->uk_domain[domain];
 	LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 	dom->ud_pages += keg->uk_ppera;
 	dom->ud_free_items += keg->uk_ipers;
 
 	return (slab);
 
 fail:
 	return (NULL);
 }
 
 /*
  * This function is intended to be used early on in place of page_alloc().  It
  * performs contiguous physical memory allocations and uses a bump allocator for
  * KVA, so is usable before the kernel map is initialized.
  *
  * Arguments:
  *	zone    Not referenced by this allocator
  *	bytes   The number of bytes requested; rounded up to whole pages
  *	domain  The domain to allocate the pages from
  *	pflag   Set to UMA_SLAB_BOOT
  *	wait    malloc(9)-style flags, translated with malloc2vm_flags()
  *
  * Returns:
  *	The mapped address of the allocation, or NULL if the pages could not
  *	be allocated.
  */
 static void *
 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
-	void *mem;
-	int pages;
-	int i;
+	int i, pages;
 
 	pages = howmany(bytes, PAGE_SIZE);
 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
 
 	*pflag = UMA_SLAB_BOOT;
-	m = vm_page_alloc_contig_domain(NULL, 0, domain,
-	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages, 
-	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
+	m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) |
+	    VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0,
+	    VM_MEMATTR_DEFAULT);
 	if (m == NULL)
 		return (NULL);
 
 	/* Register each page with the dump code unless M_NODUMP was given. */
 	pa = VM_PAGE_TO_PHYS(m);
 	for (i = 0; i < pages; i++, pa += PAGE_SIZE) {
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
     defined(__riscv) || defined(__powerpc64__)
 		if ((wait & M_NODUMP) == 0)
 			dump_add_page(pa);
 #endif
 	}
-	/* Allocate KVA and indirectly advance bootmem. */
-	mem = (void *)pmap_map(&bootmem, m->phys_addr,
-	    m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE);
-        if ((wait & M_ZERO) != 0)
-                bzero(mem, pages * PAGE_SIZE);
 
-        return (mem);
+	/* Allocate KVA and indirectly advance bootmem. */
+	return ((void *)pmap_map(&bootmem, m->phys_addr,
+	    m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE));
 }
 
 /*
  * Release memory obtained from startup_alloc(): remove the mapping if it
  * lies inside the boot-time KVA range, then unwire and free each backing
  * page.
  *
  * Arguments:
  *	mem    Start of the region to release
  *	bytes  Size of the region; consumed one PAGE_SIZE at a time
  */
 static void
 startup_free(void *mem, vm_size_t bytes)
 {
 	vm_offset_t start;
 	vm_page_t pg;
 
 	start = (vm_offset_t)mem;
 	pg = PHYS_TO_VM_PAGE(pmap_kextract(start));
 
 	/*
 	 * On some platforms startup_alloc() hands out direct-mapped slabs;
 	 * ranges of the direct map must not be unmapped.
 	 */
 	if (start >= bootstart && start + bytes <= bootmem)
 		pmap_remove(kernel_pmap, start, start + bytes);
 
 	while (bytes != 0) {
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
     defined(__riscv) || defined(__powerpc64__)
 		dump_drop_page(VM_PAGE_TO_PHYS(pg));
 #endif
 		vm_page_unwire_noq(pg);
 		vm_page_free(pg);
 		pg++;
 		bytes -= PAGE_SIZE;
 	}
 }
 
 /*
  * Allocates a number of pages from the system
  *
  * Arguments:
  *	zone    Not referenced by this allocator
  *	bytes   The number of bytes requested
  *	domain  The domain the memory must come from
  *	pflag   Set to UMA_SLAB_KERNEL
  *	wait    Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 
 	*pflag = UMA_SLAB_KERNEL;
 	return ((void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes,
 	    wait));
 }
 
 /*
  * Allocate the backing pages for a per-CPU slab: one wired page for each CPU
  * slot in 0..mp_maxid, mapped back to back into newly allocated KVA.  With
  * NUMA, the page for a present CPU is requested from that CPU's domain first
  * and from any domain if that fails or the domain is empty.
  *
  * Arguments:
  *	zone    Not referenced by this allocator
  *	bytes   Must equal (mp_maxid + 1) * PAGE_SIZE
  *	domain  Not referenced by this allocator
  *	pflag   Set to UMA_SLAB_KERNEL
  *	wait    malloc(9)-style flags, translated with malloc2vm_flags()
  *
  * Returns:
  *	The base address of the mapping, or NULL if a page or the KVA could
  *	not be allocated; pages already obtained are released in that case.
  */
 static void *
 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	struct pglist alloctail;
 	vm_offset_t addr, zkva;
 	int cpu, flags;
 	vm_page_t p, p_next;
 #ifdef NUMA
 	struct pcpu *pc;
 #endif
 
 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
 
 	TAILQ_INIT(&alloctail);
 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | malloc2vm_flags(wait);
 	*pflag = UMA_SLAB_KERNEL;
 	/* Collect one page per CPU slot, absent CPUs included. */
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (CPU_ABSENT(cpu)) {
 			p = vm_page_alloc_noobj(flags);
 		} else {
 #ifndef NUMA
 			p = vm_page_alloc_noobj(flags);
 #else
 			pc = pcpu_find(cpu);
 			if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
 				p = NULL;
 			else
 				p = vm_page_alloc_noobj_domain(pc->pc_domain,
 				    flags);
 			/* Fall back to an allocation from any domain. */
 			if (__predict_false(p == NULL))
 				p = vm_page_alloc_noobj(flags);
 #endif
 		}
 		if (__predict_false(p == NULL))
 			goto fail;
 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
 	}
 	if ((addr = kva_alloc(bytes)) == 0)
 		goto fail;
 	/* Map the pages in list order, one page of KVA per CPU slot. */
 	zkva = addr;
 	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
 		zkva += PAGE_SIZE;
 	}
 	return ((void*)addr);
 fail:
 	/* Release every page collected so far. */
 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 		vm_page_unwire_noq(p);
 		vm_page_free(p);
 	}
 	return (NULL);
 }
 
 /*
  * Allocates a number of pages that do not belong to a VM object and maps
  * them at the next unused offset of the keg's private KVA range.
  *
  * Arguments:
  *	zone    The zone whose keg supplies the KVA (uk_kva / uk_offset)
  *	bytes   The number of bytes requested
  *	domain  The domain to allocate the pages from
  *	flags   Set to UMA_SLAB_PRIV on success
  *	wait    Shall we wait?
  *
  * Returns:
  *	A pointer to the alloced memory or possibly
  *	NULL if M_NOWAIT is set.
  */
 static void *
 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
     int wait)
 {
 	TAILQ_HEAD(, vm_page) pages;
 	vm_page_t pg, pg_next;
 	vm_offset_t base, va;
 	uma_keg_t keg;
 	u_long remaining;
 	int req;
 
 	TAILQ_INIT(&pages);
 	keg = zone->uz_keg;
 	req = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED;
 	if ((wait & M_WAITOK) != 0)
 		req |= VM_ALLOC_WAITOK;
 
 	for (remaining = howmany(bytes, PAGE_SIZE); remaining > 0;
 	    remaining--) {
 		pg = vm_page_alloc_noobj_domain(domain, req);
 		if (pg == NULL) {
 			/*
 			 * Out of pages: give back what was gathered so far
 			 * and report failure.
 			 */
 			TAILQ_FOREACH_SAFE(pg, &pages, listq, pg_next) {
 				vm_page_unwire_noq(pg);
 				vm_page_free(pg);
 			}
 			return (NULL);
 		}
 		/*
 		 * The page has no object, so its listq linkage is free for
 		 * our use.
 		 */
 		TAILQ_INSERT_TAIL(&pages, pg, listq);
 	}
 
 	*flags = UMA_SLAB_PRIV;
 	base = keg->uk_kva +
 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
 	va = base;
 	TAILQ_FOREACH(pg, &pages, listq) {
 		pmap_qenter(va, &pg, 1);
 		va += PAGE_SIZE;
 	}
 
 	return ((void *)base);
 }
 
 /*
  * Allocate physically contiguous pages from the given domain.  The slab
  * flag is set to UMA_SLAB_KERNEL; the result of the underlying allocator is
  * returned unchanged.
  */
 static void *
 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
     int wait)
 {
 	void *mem;
 
 	*pflag = UMA_SLAB_KERNEL;
 	mem = (void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
 	    bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
 	return (mem);
 }
 
 /*
  * Frees a number of pages to the system
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed
  *	flags The original p->us_flags field; UMA_SLAB_BOOT memory is handed
  *	      to startup_free(), anything else must be UMA_SLAB_KERNEL
  *
  * Returns:
  *	Nothing
  */
 static void
 page_free(void *mem, vm_size_t size, uint8_t flags)
 {
 
 	if ((flags & UMA_SLAB_BOOT) == 0) {
 		KASSERT((flags & UMA_SLAB_KERNEL) != 0,
 		    ("UMA: page_free used with invalid flags %x", flags));
 		kmem_free((vm_offset_t)mem, size);
 	} else {
 		startup_free(mem, size);
 	}
 }
 
 /*
  * Frees pcpu zone allocations
  *
  * Arguments:
  *	mem   A pointer to the memory to be freed
  *	size  The size of the memory being freed; one page per CPU slot
  *	flags The original p->us_flags field; UMA_SLAB_BOOT memory is handed
  *	      to startup_free()
  *
  * Returns:
  *	Nothing
  */
 static void
 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
 {
 	vm_offset_t base, end, va;
 	vm_page_t pg;
 
 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
 
 	if ((flags & UMA_SLAB_BOOT) != 0) {
 		startup_free(mem, size);
 		return;
 	}
 
 	/* Unwire and free each backing page, then drop the mapping and KVA. */
 	base = (vm_offset_t)mem;
 	end = base + size;
 	for (va = base; va < end; va += PAGE_SIZE) {
 		pg = PHYS_TO_VM_PAGE(pmap_kextract(va));
 		vm_page_unwire_noq(pg);
 		vm_page_free(pg);
 	}
 	pmap_qremove(base, size >> PAGE_SHIFT);
 	kva_free(base, size);
 }
 
 /*
  * Item initializer that clears the item to zero.
  *
  * Arguments/Returns follow uma_init specifications; the flags argument is
  * ignored and the result is always 0.
  */
 static int
 zero_init(void *mem, int size, int flags)
 {
 
 	memset(mem, 0, size);
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * Locate the debug bitset of a slab: it sits BITSET_SIZE(uk_ipers) bytes
  * past the start of the slab's us_free bitset.
  */
 static struct noslabbits *
 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
 {
 	char *freebits;
 
 	freebits = (char *)&slab->us_free;
 	return ((void *)(freebits + BITSET_SIZE(keg->uk_ipers)));
 }
 #endif
 
 /*
  * Actual size of embedded struct slab (!OFFPAGE): the fixed header plus
  * SLAB_BITSETS bitsets of nitems bits each, rounded up to a multiple of
  * UMA_ALIGN_PTR + 1.
  */
 static size_t
 slab_sizeof(int nitems)
 {
 
 	return (roundup(sizeof(struct uma_slab) +
 	    BITSET_SIZE(nitems) * SLAB_BITSETS, UMA_ALIGN_PTR + 1));
 }
 
 /*
  * Fixed-point helpers with UMA_FIXPT_SHIFT fractional bits.
  *
  *	UMA_FRAC_FIXPT(n, d)	n / d as a fixed-point value
  *	UMA_FIXPT_PCT(f)	fixed-point value f as an integer percentage
  *	UMA_PCT_FIXPT(pct)	integer percentage as a fixed-point value
  *	UMA_MIN_EFF		(100 - UMA_MAX_WASTE) percent in fixed point
  */
 #define	UMA_FIXPT_SHIFT	31
 #define	UMA_FRAC_FIXPT(n, d)						\
 	((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d)))
 #define	UMA_FIXPT_PCT(f)						\
 	((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT))
 #define	UMA_PCT_FIXPT(pct)	UMA_FRAC_FIXPT((pct), 100)
 #define	UMA_MIN_EFF	UMA_PCT_FIXPT(100 - UMA_MAX_WASTE)
 
 /*
  * Compute the number of items that will fit in a slab.  If hdr is true, the
  * item count may be limited to provide space in the slab for an inline slab
  * header.  Otherwise, all slab space will be provided for item storage.
  *
  * Arguments:
  *	size      Item size without trailing padding
  *	rsize     Item size including padding
  *	slabsize  Total slab size
  *	hdr       Whether an inline slab header must also fit
  *
  * Returns:
  *	The item count, at most SLAB_MAX_SETSIZE.
  */
 static u_int
 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr)
 {
 	u_int count, tailpad;
 
 	/* Inter-item padding is not needed behind the last item. */
 	tailpad = rsize - size;
 
 	if (!hdr)
 		return (MIN((slabsize + tailpad) / rsize, SLAB_MAX_SETSIZE));
 
 	/*
 	 * Begin at the largest possible item count and drop items until
 	 * the slab header fits alongside the allocatable memory.
 	 */
 	count = MIN(SLAB_MAX_SETSIZE,
 	    (slabsize + tailpad - slab_sizeof(1)) / rsize);
 	while (count > 0 &&
 	    count * rsize - tailpad + slab_sizeof(count) > slabsize)
 		count--;
 
 	return (count);
 }
 
 /* Result of evaluating one candidate slab layout; see keg_layout_one(). */
 struct keg_layout_result {
 	u_int format;		/* layout flags of the candidate */
 	u_int slabsize;		/* slab size in bytes */
 	u_int ipers;		/* items per slab */
 	u_int eff;		/* efficiency, a UMA_FRAC_FIXPT() value */
 };
 
 /*
  * Evaluate a single candidate layout for a keg and store the outcome in *kl.
  *
  * Arguments:
  *	keg       The keg being laid out; only uk_size is read
  *	rsize     Item size including padding
  *	slabsize  Candidate slab size in bytes
  *	fmt       Candidate format: 0 (inline header), UMA_ZFLAG_OFFPAGE, or
  *	          the pseudo-format UMA_ZFLAG_INTERNAL
  *	kl        Receives format, slabsize, ipers and efficiency
  */
 static void
 keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt,
     struct keg_layout_result *kl)
 {
 	u_int footprint;
 	bool offpage;
 
 	offpage = (fmt & UMA_ZFLAG_OFFPAGE) != 0;
 
 	if ((fmt & UMA_ZFLAG_INTERNAL) != 0) {
 		/* INTERNAL is treated as inline with one extra page. */
 		kl->format = fmt & ~UMA_ZFLAG_INTERNAL;
 		kl->slabsize = slabsize + PAGE_SIZE;
 	} else {
 		kl->format = fmt;
 		kl->slabsize = slabsize;
 	}
 
 	/* Only an inline header takes space away from the items. */
 	kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize,
 	    !offpage);
 
 	/* An offpage slab header costs memory too; charge it here. */
 	footprint = kl->slabsize;
 	if (offpage)
 		footprint += slabzone(kl->ipers)->uz_keg->uk_rsize;
 
 	kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, footprint);
 }
 
 /*
  * Determine the format of a uma keg.  This determines where the slab header
  * will be placed (inline or offpage) and calculates ipers, rsize, and ppera.
  *
  * Arguments
  *	keg  The keg to lay out.  uk_size, uk_align and uk_flags are read;
  *	     uk_rsize, uk_ipers, uk_ppera and uk_flags are written.
  *
  * Returns
  *	Nothing
  */
 static void
 keg_layout(uma_keg_t keg)
 {
 	struct keg_layout_result kl = {}, kl_tmp;
 	u_int fmts[2];
 	u_int alignsize;
 	u_int nfmt;
 	u_int pages;
 	u_int rsize;
 	u_int slabsize;
 	u_int i, j;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
 	    (keg->uk_size <= UMA_PCPU_ALLOC_SIZE &&
 	     (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0),
 	    ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b",
 	     __func__, keg->uk_name, keg->uk_size, keg->uk_flags,
 	     PRINT_UMA_ZFLAGS));
 	KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 ||
 	    (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0,
 	    ("%s: incompatible flags 0x%b", __func__, keg->uk_flags,
 	     PRINT_UMA_ZFLAGS));
 
 	/* uk_align is a mask, so the alignment in bytes is one more. */
 	alignsize = keg->uk_align + 1;
 #ifdef KASAN
 	/*
 	 * ASAN requires that each allocation be aligned to the shadow map
 	 * scale factor.
 	 */
 	if (alignsize < KASAN_SHADOW_SCALE)
 		alignsize = KASAN_SHADOW_SCALE;
 #endif
 
 	/*
 	 * Calculate the size of each allocation (rsize) according to
 	 * alignment.  If the requested size is smaller than we have
 	 * allocation bits for we round it up.
 	 */
 	rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT);
 	rsize = roundup2(rsize, alignsize);
 
 	if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) {
 		/*
 		 * We want one item to start on every align boundary in a page.
 		 * To do this we will span pages.  We will also extend the item
 		 * by the size of align if it is an even multiple of align.
 		 * Otherwise, it would fall on the same boundary every time.
 		 */
 		if ((rsize & alignsize) == 0)
 			rsize += alignsize;
 		slabsize = rsize * (PAGE_SIZE / alignsize);
 		slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE);
 		slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE);
 		slabsize = round_page(slabsize);
 	} else {
 		/*
 		 * Start with a slab size of as many pages as it takes to
 		 * represent a single item.  We will try to fit as many
 		 * additional items into the slab as possible.
 		 */
 		slabsize = round_page(keg->uk_size);
 	}
 
 	/* Build a list of all of the available formats for this keg. */
 	nfmt = 0;
 
 	/* Evaluate an inline slab layout. */
 	if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0)
 		fmts[nfmt++] = 0;
 
 	/* TODO: vm_page-embedded slab. */
 
 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
 	 * may end up going to the VM for slabs which we do not want
 	 * to do if we're UMA_ZONE_VM, which clearly forbids it.
 	 * In those cases, evaluate a pseudo-format called INTERNAL
 	 * which has an inline slab header and one extra page to
 	 * guarantee that it fits.
 	 *
 	 * Otherwise, see if using an OFFPAGE slab will improve our
 	 * efficiency.
 	 */
 	if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0)
 		fmts[nfmt++] = UMA_ZFLAG_INTERNAL;
 	else
 		fmts[nfmt++] = UMA_ZFLAG_OFFPAGE;
 
 	/*
 	 * Choose a slab size and format which satisfy the minimum efficiency.
 	 * Prefer the smallest slab size that meets the constraints.
 	 *
 	 * Start with a minimum slab size, to accommodate CACHESPREAD.  Then,
 	 * for small items (up to PAGE_SIZE), the iteration increment is one
 	 * page; and for large items, the increment is one item.
 	 */
 	i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize);
 	KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u",
 	    keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize,
 	    rsize, i));
 	for ( ; ; i++) {
 		slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) :
 		    round_page(rsize * (i - 1) + keg->uk_size);
 
 		for (j = 0; j < nfmt; j++) {
 			/* Only if we have no viable format yet. */
 			if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 &&
 			    kl.ipers > 0)
 				continue;
 
 			/* Keep a candidate only if it beats the best so far. */
 			keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp);
 			if (kl_tmp.eff <= kl.eff)
 				continue;
 
 			kl = kl_tmp;
 
 			CTR6(KTR_UMA, "keg %s layout: format %#x "
 			    "(ipers %u * rsize %u) / slabsize %#x = %u%% eff",
 			    keg->uk_name, kl.format, kl.ipers, rsize,
 			    kl.slabsize, UMA_FIXPT_PCT(kl.eff));
 
 			/* Stop when we reach the minimum efficiency. */
 			if (kl.eff >= UMA_MIN_EFF)
 				break;
 		}
 
 		/*
 		 * Stop growing the slab once it is efficient enough, when
 		 * multi-page slabs are disabled, when the slab already holds
 		 * the maximum item count, or for PCPU and CONTIG kegs.
 		 */
 		if (kl.eff >= UMA_MIN_EFF || !multipage_slabs ||
 		    slabsize >= SLAB_MAX_SETSIZE * rsize ||
 		    (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0)
 			break;
 	}
 
 	/* A PCPU keg needs one copy of the slab per CPU slot. */
 	pages = atop(kl.slabsize);
 	if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
 		pages *= mp_maxid + 1;
 
 	keg->uk_rsize = rsize;
 	keg->uk_ipers = kl.ipers;
 	keg->uk_ppera = pages;
 	keg->uk_flags |= kl.format;
 
 	/*
 	 * How do we find the slab header if it is offpage or if not all item
 	 * start addresses are in the same page?  We could solve the latter
 	 * case with vaddr alignment, but we don't.
 	 */
 	if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 ||
 	    (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) {
 		if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0)
 			keg->uk_flags |= UMA_ZFLAG_HASH;
 		else
 			keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
 	}
 
 	CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u",
 	    __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers,
 	    pages);
 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
 	    ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__,
 	     keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize,
 	     keg->uk_ipers, pages));
 }
 
 /*
  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
  * the keg onto the global keg list.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	mem    The keg to initialize; zeroed over 'size' bytes first
  *	udata  Actually uma_kctor_args
  *
  * Always returns 0.
  */
 static int
 keg_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_kctor_args *arg = udata;
 	uma_keg_t keg = mem;
 	uma_zone_t zone;
 	int i;
 
 	bzero(keg, size);
 	keg->uk_size = arg->size;
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
 	keg->uk_reserve = 0;
 	keg->uk_flags = arg->flags;
 
 	/*
 	 * We use a global round-robin policy by default.  Zones with
 	 * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which
 	 * case the iterator is never run.
 	 */
 	keg->uk_dr.dr_policy = DOMAINSET_RR();
 	keg->uk_dr.dr_iter = 0;
 
 	/*
 	 * The primary zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
 	keg->uk_name = zone->uz_name;
 
 	/* UMA_ZONE_ZINIT overrides the supplied init routine. */
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
 	if (arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
 
 	/* Without SMP the PCPU flag is cleared before the layout is chosen. */
 #ifndef SMP
 	keg->uk_flags &= ~UMA_ZONE_PCPU;
 #endif
 
 	keg_layout(keg);
 
 	/*
 	 * Use a first-touch NUMA policy for kegs that pmap_extract() will
 	 * work on.  Use round-robin for everything else.
 	 *
 	 * Zones may override the default by specifying either.
 	 */
 #ifdef NUMA
 	if ((keg->uk_flags &
 	    (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0)
 		keg->uk_flags |= UMA_ZONE_FIRSTTOUCH;
 	else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0)
 		keg->uk_flags |= UMA_ZONE_ROUNDROBIN;
 #endif
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
 	 * startup cache until the vm is ready.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera == 1)
 		keg->uk_allocf = uma_small_alloc;
 	else
 #endif
 	if (booted < BOOT_KVA)
 		keg->uk_allocf = startup_alloc;
 	else if (keg->uk_flags & UMA_ZONE_PCPU)
 		keg->uk_allocf = pcpu_page_alloc;
 	else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
 		keg->uk_allocf = contig_alloc;
 	else
 		keg->uk_allocf = page_alloc;
 	/* Choose the free routine for the keg's slabs. */
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera == 1)
 		keg->uk_freef = uma_small_free;
 	else
 #endif
 	if (keg->uk_flags & UMA_ZONE_PCPU)
 		keg->uk_freef = pcpu_page_free;
 	else
 		keg->uk_freef = page_free;
 
 	/*
 	 * Initialize keg's locks.
 	 */
 	for (i = 0; i < vm_ndomains; i++)
 		KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS));
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
 	 * figure out where in each page it goes.  See slab_sizeof
 	 * definition.
 	 */
 	if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) {
 		size_t shsize;
 
 		/* The header occupies the last shsize bytes of the slab. */
 		shsize = slab_sizeof(keg->uk_ipers);
 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
 		KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
 	}
 
 	if (keg->uk_flags & UMA_ZFLAG_HASH)
 		hash_alloc(&keg->uk_hash, 0);
 
 	CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone);
 
 	/* Link the primary zone onto the keg's zone list. */
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
 	/* Add the keg to the global keg list under the write lock. */
 	rw_wlock(&uma_rwlock);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 	rw_wunlock(&uma_rwlock);
 	return (0);
 }
 
 /*
  * Per-zone callback that moves a keg off the boot-time allocator: if the
  * zone's keg still allocates with startup_alloc, its allocator is replaced
  * with the regular one for its type.  Cache-only zones are skipped.
  */
 static void
 zone_kva_available(uma_zone_t zone, void *unused)
 {
 	uma_keg_t keg;
 
 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
 		return;
 	KEG_GET(zone, keg);
 
 	/* Nothing to do unless the keg is still on startup_alloc. */
 	if (keg->uk_allocf != startup_alloc)
 		return;
 
 	/* Switch to the real allocator. */
 	if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
 		keg->uk_allocf = pcpu_page_alloc;
 	else if (keg->uk_ppera > 1 && (keg->uk_flags & UMA_ZONE_CONTIG) != 0)
 		keg->uk_allocf = contig_alloc;
 	else
 		keg->uk_allocf = page_alloc;
 }
 
 /*
  * Per-zone callback that allocates the zone's four statistics counters
  * (allocs, frees, fails, xdomain).  Each allocation uses M_WAITOK.
  */
 static void
 zone_alloc_counters(uma_zone_t zone, void *unused)
 {
 	/* The second argument is not used. */
 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
 	zone->uz_xdomain = counter_u64_alloc(M_WAITOK);
 }
 
 static void
 zone_alloc_sysctl(uma_zone_t zone, void *unused)
 {
 	uma_zone_domain_t zdom;
 	uma_domain_t dom;
 	uma_keg_t keg;
 	struct sysctl_oid *oid, *domainoid;
 	int domains, i, cnt;
 	static const char *nokeg = "cache zone";
 	char *c;
 
 	/*
 	 * Make a sysctl safe copy of the zone name by removing
 	 * any special characters and handling dups by appending
 	 * an index.
 	 */
 	if (zone->uz_namecnt != 0) {
 		/* Count the number of decimal digits and '_' separator. */
 		for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
 			cnt /= 10;
 		zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
 		    M_UMA, M_WAITOK);
 		sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
 		    zone->uz_namecnt);
 	} else
 		zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
 	for (c = zone->uz_ctlname; *c != '\0'; c++)
 		if (strchr("./\\ -", *c) != NULL)
 			*c = '_';
 
 	/*
 	 * Basic parameters at the root.
 	 */
 	zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
 	    OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	oid = zone->uz_oid;
 	SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
 	    zone, 0, sysctl_handle_uma_zone_flags, "A",
 	    "Allocator configuration flags");
 	SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
 	    "Desired per-cpu cache size");
 	SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
 	    "Maximum allowed per-cpu cache size");
 
 	/*
 	 * keg if present.
 	 */
 	if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
 		domains = vm_ndomains;
 	else
 		domains = 1;
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 	    "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	keg = zone->uz_keg;
 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
 		SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "name", CTLFLAG_RD, keg->uk_name, "Keg name");
 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
 		    "Real object size with alignment");
 		SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
 		    "pages per-slab allocation");
 		SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
 		    "items available per-slab");
 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "align", CTLFLAG_RD, &keg->uk_align, 0,
 		    "item alignment mask");
 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "reserve", CTLFLAG_RD, &keg->uk_reserve, 0,
 		    "number of reserved items");
 		SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
 		    keg, 0, sysctl_handle_uma_slab_efficiency, "I",
 		    "Slab utilization (100 - internal fragmentation %)");
 		domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid),
 		    OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 		for (i = 0; i < domains; i++) {
 			dom = &keg->uk_domain[i];
 			oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
 			    OID_AUTO, VM_DOMAIN(i)->vmd_name,
 			    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 			SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 			    "pages", CTLFLAG_RD, &dom->ud_pages, 0,
 			    "Total pages currently allocated from VM");
 			SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 			    "free_items", CTLFLAG_RD, &dom->ud_free_items, 0,
 			    "Items free in the slab layer");
 			SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 			    "free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0,
 			    "Unused slabs");
 		}
 	} else
 		SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "name", CTLFLAG_RD, nokeg, "Keg name");
 
 	/*
 	 * Information about zone limits.
 	 */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 	    "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 	    zone, 0, sysctl_handle_uma_zone_items, "QU",
 	    "Current number of allocated items if limit is set");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
 	    "Maximum number of allocated and cached items");
 	SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
 	    "Number of threads sleeping at limit");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
 	    "Total zone limit sleeps");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0,
 	    "Maximum number of items in each domain's bucket cache");
 
 	/*
 	 * Per-domain zone information.
 	 */
 	domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
 	    OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	for (i = 0; i < domains; i++) {
 		zdom = ZDOM_GET(zone, i);
 		oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
 		    OID_AUTO, VM_DOMAIN(i)->vmd_name,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "nitems", CTLFLAG_RD, &zdom->uzd_nitems,
 		    "number of items in this domain");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "imax", CTLFLAG_RD, &zdom->uzd_imax,
 		    "maximum item count in this period");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "imin", CTLFLAG_RD, &zdom->uzd_imin,
 		    "minimum item count in this period");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "bimin", CTLFLAG_RD, &zdom->uzd_bimin,
 		    "Minimum item count in this batch");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "wss", CTLFLAG_RD, &zdom->uzd_wss,
 		    "Working set size");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "limin", CTLFLAG_RD, &zdom->uzd_limin,
 		    "Long time minimum item count");
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "timin", CTLFLAG_RD, &zdom->uzd_timin, 0,
 		    "Time since zero long time minimum item count");
 	}
 
 	/*
 	 * General statistics.
 	 */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 	    "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
 	    zone, 1, sysctl_handle_uma_zone_cur, "I",
 	    "Current number of allocated items");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 	    zone, 0, sysctl_handle_uma_zone_allocs, "QU",
 	    "Total allocation calls");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 	    zone, 0, sysctl_handle_uma_zone_frees, "QU",
 	    "Total free calls");
 	SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "fails", CTLFLAG_RD, &zone->uz_fails,
 	    "Number of allocation failures");
 	SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "xdomain", CTLFLAG_RD, &zone->uz_xdomain,
 	    "Free calls from the wrong domain");
 }
 
 /*
  * Accumulator handed to zone_count() while walking every zone.  The caller
  * fills in 'name' and zeroes 'count'; after the walk 'count' is one greater
  * than the largest uz_namecnt found among zones with that name, or still 0
  * if no zone matched.
  */
 struct uma_zone_count {
 	const char	*name;
 	int		count;
 };
 
 /*
  * Zone-walk callback: if 'zone' carries the name recorded in the
  * struct uma_zone_count passed through 'arg', raise the running count so
  * that it is at least zone->uz_namecnt + 1.
  */
 static void
 zone_count(uma_zone_t zone, void *arg)
 {
 	struct uma_zone_count *cnt = arg;
 
 	if (strcmp(zone->uz_name, cnt->name) != 0)
 		return;
 
 	/*
 	 * Identically named zones can be created in quick succession and
 	 * torn down in any order, which leaves holes in the numbering.
 	 * Track one past the highest number seen rather than a simple tally.
 	 */
 	if (cnt->count <= zone->uz_namecnt + 1)
 		cnt->count = zone->uz_namecnt + 1;
 }
 
 /*
  * Copy the zone's current item size and flags into every per-CPU cache
  * slot (indices 0 through mp_maxid inclusive).
  */
 static void
 zone_update_caches(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	int cpu;
 
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		cache = &zone->uz_cpu[cpu];
 		cache_set_uz_size(cache, zone->uz_size);
 		cache_set_uz_flags(cache, zone->uz_flags);
 	}
 }
 
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
  * Three kinds of zone are set up here:
  *  - pure cache zones (arg->import is set): no keg, linked on uma_cachezones;
  *  - secondary zones (UMA_ZONE_SECONDARY): appended to the zone list of the
  *    existing keg passed in arg->keg;
  *  - primary zones: a keg is allocated with uma_kcreate() when arg->keg is
  *    NULL, otherwise keg_ctor() is run directly on the memory in arg->keg.
  *
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  *
  * Returns 0 on success, ENOMEM if uma_kcreate() fails, or the error returned
  * by keg_ctor().
  */
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
 	struct uma_zone_count cnt;
 	struct uma_zctor_args *arg = udata;
 	uma_zone_domain_t zdom;
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
 	int i;
 
 	/* Start from a fully zeroed header, then fill in the basics. */
 	bzero(zone, size);
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_sleeps = 0;
 	zone->uz_bucket_size = 0;
 	zone->uz_bucket_size_min = 0;
 	zone->uz_bucket_size_max = BUCKET_MAX;
 	/* Only the SMR flag is taken from the caller at this point. */
 	zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
 	zone->uz_warning = NULL;
 	/* The domain structures follow the cpu structures. */
 	zone->uz_bucket_max = ULONG_MAX;
 	timevalclear(&zone->uz_ratecheck);
 
 	/* Count the number of duplicate names. */
 	cnt.name = arg->name;
 	cnt.count = 0;
 	zone_foreach(zone_count, &cnt);
 	zone->uz_namecnt = cnt.count;
 	ZONE_CROSS_LOCK_INIT(zone);
 
 	/* Set up the lock and bucket queue of every per-domain structure. */
 	for (i = 0; i < vm_ndomains; i++) {
 		zdom = ZDOM_GET(zone, i);
 		ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS));
 		STAILQ_INIT(&zdom->uzd_buckets);
 	}
 
 #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN)
 	/*
 	 * uma_zcreate() substitutes trash_init/trash_fini when it decides the
 	 * zone's memory may be scrambled; recognize that pair here.
 	 */
 	if (arg->uminit == trash_init && arg->fini == trash_fini)
 		zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
 #elif defined(KASAN)
 	/* NOFREE and cache zones are excluded from KASAN tracking. */
 	if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0)
 		arg->flags |= UMA_ZONE_NOKASAN;
 #endif
 
 	/*
 	 * This is a pure cache zone, no kegs.
 	 */
 	if (arg->import) {
 		KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0,
 		    ("zone_ctor: Import specified for non-cache zone."));
 		/* Note: this overwrites any flags accumulated above. */
 		zone->uz_flags = arg->flags;
 		zone->uz_size = arg->size;
 		zone->uz_import = arg->import;
 		zone->uz_release = arg->release;
 		zone->uz_arg = arg->arg;
 #ifdef NUMA
 		/*
 		 * Cache zones are round-robin unless a policy is
 		 * specified because they may have incompatible
 		 * constraints.
 		 */
 		if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0)
 			zone->uz_flags |= UMA_ZONE_ROUNDROBIN;
 #endif
 		rw_wlock(&uma_rwlock);
 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
 		rw_wunlock(&uma_rwlock);
 		goto out;
 	}
 
 	/*
 	 * Use the regular zone/keg/slab allocator.
 	 */
 	zone->uz_import = zone_import;
 	zone->uz_release = zone_release;
 	zone->uz_arg = zone; 
 	keg = arg->keg;
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		rw_wlock(&uma_rwlock);
 		ZONE_LOCK(zone);
 		/* Walk to the last zone on the keg and link ourselves after it. */
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
 				break;
 			}
 		}
 		ZONE_UNLOCK(zone);
 		rw_wunlock(&uma_rwlock);
 	} else if (keg == NULL) {
 		/* Ordinary primary zone: allocate a fresh keg for it. */
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
 		int error;
 
 		/* We should only be here from uma_startup() */
 		karg.size = arg->size;
 		karg.uminit = arg->uminit;
 		karg.fini = arg->fini;
 		karg.align = arg->align;
 		/* The SMR flag is stripped before it reaches the keg. */
 		karg.flags = (arg->flags & ~UMA_ZONE_SMR);
 		karg.zone = zone;
 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 		    flags);
 		if (error)
 			return (error);
 	}
 
 	/* Inherit properties from the keg. */
 	zone->uz_keg = keg;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 out:
 	/*
 	 * Counters and sysctl nodes are only allocated once boot has reached
 	 * the matching stage; before that the counters get a placeholder and
 	 * uma_startup_pcpu()/uma_startup3() walk all zones to fill them in.
 	 */
 	if (booted >= BOOT_PCPU) {
 		zone_alloc_counters(zone, NULL);
 		if (booted >= BOOT_RUNNING)
 			zone_alloc_sysctl(zone, NULL);
 	} else {
 		zone->uz_allocs = EARLY_COUNTER;
 		zone->uz_frees = EARLY_COUNTER;
 		zone->uz_fails = EARLY_COUNTER;
 	}
 
 	/* Caller requests a private SMR context. */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 		zone->uz_smr = smr_create(zone->uz_name, 0, 0);
 
 	/* Choose the initial per-CPU bucket size from the caller's flags. */
 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
 	    ("Invalid zone flag combination"));
 	if (arg->flags & UMA_ZFLAG_INTERNAL)
 		zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
 		zone->uz_bucket_size = BUCKET_MAX;
 	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
 		zone->uz_bucket_size = 0;
 	else
 		zone->uz_bucket_size = bucket_select(zone->uz_size);
 	zone->uz_bucket_size_min = zone->uz_bucket_size;
 	if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
 		zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
 	/* Publish the final size and flags to the per-CPU caches. */
 	zone_update_caches(zone);
 
 	return (0);
 }
 
 /*
  * Keg header dtor.  Tears down the per-domain keg locks, warns if the keg
  * still owns pages, and releases the keg's hash table.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 keg_dtor(void *arg, int size, void *udata)
 {
 	uma_keg_t keg = arg;
 	uint32_t nfree = 0, npages = 0;
 	int domain;
 
 	/* Total up what each domain still holds while destroying its lock. */
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		nfree += keg->uk_domain[domain].ud_free_items;
 		npages += keg->uk_domain[domain].ud_pages;
 		KEG_LOCK_FINI(keg, domain);
 	}
 
 	/* Any remaining page means items were never returned to the keg. */
 	if (npages != 0) {
 		printf("Freed UMA keg (%s) was not empty (%u items). "
 		    " Lost %u pages of memory.\n",
 		    keg->uk_name ? keg->uk_name : "",
 		    npages / keg->uk_ppera * keg->uk_ipers - nfree, npages);
 	}
 
 	hash_free(&keg->uk_hash);
 }
 
 /*
  * Zone header dtor.  Removes the zone's sysctl nodes, drains its caches,
  * unlinks it from the global lists, reclaims its memory, frees the backing
  * keg when this zone owns one, and finally releases counters and locks.
  *
  * Arguments/Returns follow uma_dtor specifications
  *	udata  unused
  */
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
 	uma_zone_t zone;
 	uma_keg_t keg;
 	int i;
 
 	zone = (uma_zone_t)arg;
 
 	/* Take down the sysctl node created for this zone first. */
 	sysctl_remove_oid(zone->uz_oid, 1, 1);
 
 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
 
 	rw_wlock(&uma_rwlock);
 	LIST_REMOVE(zone, uz_link);
 	rw_wunlock(&uma_rwlock);
 	/*
 	 * For a zone that owns its keg, clear the keg's reserve before
 	 * reclaiming.  NOTE(review): presumably so reserved items do not
 	 * survive the reclaim below -- confirm against zone_reclaim().
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
 		keg = zone->uz_keg;
 		keg->uk_reserve = 0;
 	}
 	zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true);
 
 	/*
 	 * We only destroy kegs from non secondary/non cache zones.
 	 */
 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
 		keg = zone->uz_keg;
 		rw_wlock(&uma_rwlock);
 		LIST_REMOVE(keg, uk_link);
 		rw_wunlock(&uma_rwlock);
 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
 	}
 	/* Release the statistics counters and the sysctl name string. */
 	counter_u64_free(zone->uz_allocs);
 	counter_u64_free(zone->uz_frees);
 	counter_u64_free(zone->uz_fails);
 	counter_u64_free(zone->uz_xdomain);
 	free(zone->uz_ctlname, M_UMA);
 	/* Locks go last, mirroring their initialization in zone_ctor(). */
 	for (i = 0; i < vm_ndomains; i++)
 		ZDOM_LOCK_FINI(ZDOM_GET(zone, i));
 	ZONE_CROSS_LOCK_FINI(zone);
 }
 
 /*
  * Invoke 'zfunc' on every zone in the system without taking uma_rwlock:
  * first the zones hanging off each keg on uma_kegs, then the zones on
  * uma_cachezones.  'arg' is passed through to the callback untouched.
  */
 static void
 zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg)
 {
 	uma_keg_t k;
 	uma_zone_t z;
 
 	LIST_FOREACH(k, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &k->uk_zones, uz_link) {
 			zfunc(z, arg);
 		}
 	}
 
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		zfunc(z, arg);
 	}
 }
 
 /*
  * Traverses every zone in the system and calls a callback.  The walk is
  * performed with uma_rwlock read-locked; see zone_foreach_unlocked() for
  * the variant that takes no lock.
  *
  * Arguments:
  *	zfunc  A pointer to a function which accepts a zone
  *		as an argument.
  *	arg    Opaque pointer handed to every zfunc invocation.
  *
  * Returns:
  *	Nothing
  */
 static void
 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
 {
 
 	rw_rlock(&uma_rwlock);
 	zone_foreach_unlocked(zfunc, arg);
 	rw_runlock(&uma_rwlock);
 }
 
 /*
  * Initialize the kernel memory allocator.  This is done after pages can be
  * allocated but before general KVA is available.
  *
  * One contiguous startup allocation is carved into the "UMA Zones" zone
  * header, the "UMA Kegs" zone header and the keg backing "UMA Kegs"; both
  * zones are then constructed by calling zone_ctor() directly.  The internal
  * slab-header and hash zones are created afterwards with uma_zcreate().
  *
  *	virtual_avail  Recorded as the start of the boot-time memory range
  *		       (bootstart/bootmem).
  */
 void
 uma_startup1(vm_offset_t virtual_avail)
 {
 	struct uma_zctor_args args;
 	size_t ksize, zsize, size;
 	uma_keg_t primarykeg;
 	uintptr_t m;
 	int domain;
 	uint8_t pflag;
 
 	bootstart = bootmem = virtual_avail;
 
 	rw_init(&uma_rwlock, "UMA lock");
 	sx_init(&uma_reclaim_lock, "umareclaim");
 
 	/* A keg header is followed by one uma_domain per VM domain. */
 	ksize = sizeof(struct uma_keg) +
 	    (sizeof(struct uma_domain) * vm_ndomains);
 	ksize = roundup(ksize, UMA_SUPER_ALIGN);
 	/*
 	 * A zone header is followed by one uma_cache per possible CPU and
 	 * one uma_zone_domain per VM domain.
 	 */
 	zsize = sizeof(struct uma_zone) +
 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
 	zsize = roundup(zsize, UMA_SUPER_ALIGN);
 
 	/* Allocate the zone of zones, zone of kegs, and zone of zones keg. */
 	size = (zsize * 2) + ksize;
 	/* Take the memory from the first domain that can satisfy the request. */
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag,
 		    M_NOWAIT | M_ZERO);
 		if (m != 0)
 			break;
 	}
 	/*
 	 * NOTE(review): there is no check for every domain having failed;
 	 * 'm' would be 0 here in that case.
 	 */
 	zones = (uma_zone_t)m;
 	m += zsize;
 	kegs = (uma_zone_t)m;
 	m += zsize;
 	primarykeg = (uma_keg_t)m;
 
 	/* "manually" create the initial zone */
 	memset(&args, 0, sizeof(args));
 	args.name = "UMA Kegs";
 	args.size = ksize;
 	args.ctor = keg_ctor;
 	args.dtor = keg_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	/* A non-NULL keg makes zone_ctor() run keg_ctor() on this memory. */
 	args.keg = primarykeg;
 	args.align = UMA_SUPER_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(kegs, zsize, &args, M_WAITOK);
 
 	/* With args.keg NULL, the keg for this zone comes from "UMA Kegs". */
 	args.name = "UMA Zones";
 	args.size = zsize;
 	args.ctor = zone_ctor;
 	args.dtor = zone_dtor;
 	args.uminit = zero_init;
 	args.fini = NULL;
 	args.keg = NULL;
 	args.align = UMA_SUPER_ALIGN - 1;
 	args.flags = UMA_ZFLAG_INTERNAL;
 	zone_ctor(zones, zsize, &args, M_WAITOK);
 
 	/* Now make zones for slab headers */
 	slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 	slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	/* Zone supplying initial-size hash tables. */
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
 	bucket_init();
 	smr_init();
 }
 
 #ifndef UMA_MD_SMALL_ALLOC
 extern void vm_radix_reserve_kva(void);
 #endif
 
 /*
  * Advertise the availability of normal kva allocations and switch to
  * the default back-end allocator.  Marks the KVA we consumed on startup
  * as used in the map.
  */
 void
 uma_startup2(void)
 {
 
 	/*
 	 * bootmem differs from bootstart only if boot-time memory was
 	 * handed out; record that range [bootstart, bootmem) in kernel_map.
 	 */
 	if (bootstart != bootmem) {
 		vm_map_lock(kernel_map);
 		(void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem,
 		    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
 		vm_map_unlock(kernel_map);
 	}
 
 #ifndef UMA_MD_SMALL_ALLOC
 	/* Set up radix zone to use noobj_alloc. */
 	vm_radix_reserve_kva();
 #endif
 
 	/* Advance the boot stage, then notify every existing zone. */
 	booted = BOOT_KVA;
 	zone_foreach_unlocked(zone_kva_available, NULL);
 	bucket_enable();
 }
 
 /*
  * Allocate counters as early as possible so that boot-time allocations are
  * accounted more precisely.
  *
  * Runs zone_alloc_counters() on every existing zone and then advances the
  * boot stage to BOOT_PCPU, after which zone_ctor() allocates counters for
  * new zones itself.  Registered to run at SI_SUB_COUNTER.
  */
 static void
 uma_startup_pcpu(void *arg __unused)
 {
 
 	zone_foreach_unlocked(zone_alloc_counters, NULL);
 	booted = BOOT_PCPU;
 }
 SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL);
 
 /*
  * Finish our initialization steps: set up the INVARIANTS debug counters,
  * create sysctl nodes for every existing zone, start the periodic UMA
  * callout, mark the allocator as fully running, and register the shutdown
  * hook.  Registered to run at SI_SUB_VM_CONF.
  */
 static void
 uma_startup3(void *arg __unused)
 {
 
 #ifdef INVARIANTS
 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
 #endif
 	/* Zones created before this point have no sysctl nodes yet. */
 	zone_foreach_unlocked(zone_alloc_sysctl, NULL);
 	/* Schedule uma_timeout() to fire after UMA_TIMEOUT seconds. */
 	callout_init(&uma_callout, 1);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 	booted = BOOT_RUNNING;
 
 	/* uma_shutdown() flips 'booted' to BOOT_SHUTDOWN at shutdown time. */
 	EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 }
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 /*
  * shutdown_post_sync event handler: record that the system is shutting
  * down.  uma_zdestroy() consults this state to skip needless teardown.
  */
 static void
 uma_shutdown(void)
 {
 
 	booted = BOOT_SHUTDOWN;
 }
 
 /*
  * Allocate and construct a keg for 'zone' from the zone of kegs.  The
  * arguments are packaged into a uma_kctor_args for the keg constructor; an
  * 'align' of UMA_ALIGN_CACHE is replaced by the current uma_align_cache.
  *
  * Returns the result of zone_alloc_item() on the kegs zone.
  */
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, uint32_t flags)
 {
 	struct uma_kctor_args kargs;
 
 	if (align == UMA_ALIGN_CACHE)
 		align = uma_align_cache;
 
 	kargs.zone = zone;
 	kargs.size = size;
 	kargs.uminit = uminit;
 	kargs.fini = fini;
 	kargs.align = align;
 	kargs.flags = flags;
 
 	return (zone_alloc_item(kegs, &kargs, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* Public functions */
 /*
  * See uma.h
  *
  * Record 'align' as the value later substituted for UMA_ALIGN_CACHE.
  * Passing UMA_ALIGN_CACHE itself is ignored.
  */
 void
 uma_set_align(int align)
 {
 
 	if (align == UMA_ALIGN_CACHE)
 		return;
 	uma_align_cache = align;
 }
 
 /*
  * See uma.h
  *
  * Create a zone backed by its own keg.  The request is described in a
  * uma_zctor_args and handed to the zone-of-zones allocator, which runs
  * zone_ctor(); the allocation is made with uma_reclaim_lock held
  * exclusively.
  */
 uma_zone_t
 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
 {
 	struct uma_zctor_args zargs;
 	uma_zone_t zone;
 
 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
 	    align, name));
 
 	/* Everything zone_ctor() needs travels in zargs. */
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.size = size;
 	zargs.align = align;
 	zargs.flags = flags;
 	zargs.keg = NULL;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = uminit;
 	zargs.fini = fini;
 #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN)
 	/*
 	 * When it is safe to scribble over free memory, install the
 	 * use-after-free checking hooks.  That is only safe if UMA can
 	 * touch the memory, the caller supplied no init or fini, nothing
 	 * relies on the memory's initial contents, and nobody legitimately
 	 * reads it after free.  A caller-supplied ctor or dtor is fine.
 	 */
 	if ((flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH |
 	    UMA_ZONE_NOFREE)) == 0 && uminit == NULL && fini == NULL) {
 		zargs.uminit = trash_init;
 		zargs.fini = trash_fini;
 	}
 #endif
 
 	sx_xlock(&uma_reclaim_lock);
 	zone = zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK);
 	sx_xunlock(&uma_reclaim_lock);
 
 	return (zone);
 }
 
 /*
  * See uma.h
  *
  * Create a secondary zone layered on the keg of 'primary'.  Item size,
  * alignment and flags are copied from that keg, with UMA_ZONE_SECONDARY
  * added; the allocation is made with uma_reclaim_lock held exclusively.
  */
 uma_zone_t
 uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
     uma_init zinit, uma_fini zfini, uma_zone_t primary)
 {
 	struct uma_zctor_args zargs;
 	uma_keg_t pkeg;
 	uma_zone_t zone;
 
 	pkeg = primary->uz_keg;
 
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = zinit;
 	zargs.fini = zfini;
 	/* Geometry and flags come from the primary zone's keg. */
 	zargs.keg = pkeg;
 	zargs.size = pkeg->uk_size;
 	zargs.align = pkeg->uk_align;
 	zargs.flags = pkeg->uk_flags | UMA_ZONE_SECONDARY;
 
 	sx_xlock(&uma_reclaim_lock);
 	zone = zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK);
 	sx_xunlock(&uma_reclaim_lock);
 
 	return (zone);
 }
 
 /*
  * See uma.h
  *
  * Create a cache-only zone: items are obtained and returned through the
  * caller's 'zimport'/'zrelease' callbacks, with 'arg' stored for them.
  * UMA_ZFLAG_CACHE is always added to 'flags'.
  */
 uma_zone_t
 uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
     uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
     void *arg, int flags)
 {
 	struct uma_zctor_args zargs;
 
 	memset(&zargs, 0, sizeof(zargs));
 	zargs.name = name;
 	zargs.size = size;
 	zargs.align = 0;
 	zargs.flags = flags | UMA_ZFLAG_CACHE;
 	/* Per-item hooks. */
 	zargs.ctor = ctor;
 	zargs.dtor = dtor;
 	zargs.uminit = zinit;
 	zargs.fini = zfini;
 	/* Backing-store hooks and their private argument. */
 	zargs.import = zimport;
 	zargs.release = zrelease;
 	zargs.arg = arg;
 
 	return (zone_alloc_item(zones, &zargs, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /*
  * See uma.h
  *
  * Destroy 'zone' by returning it to the zone of zones, with
  * uma_reclaim_lock held exclusively.
  */
 void
 uma_zdestroy(uma_zone_t zone)
 {
 
 	/*
 	 * Reclaiming large slabs is costly.  Once the system is shutting
 	 * down, a keg-backed zone with no fini hook is simply left alone;
 	 * every other case goes through the full teardown.
 	 */
 	if (booted != BOOT_SHUTDOWN || zone->uz_fini != NULL ||
 	    zone->uz_release != zone_release) {
 		sx_xlock(&uma_reclaim_lock);
 		zone_free_item(zones, zone, NULL, SKIP_NONE);
 		sx_xunlock(&uma_reclaim_lock);
 	}
 }
 
 /*
  * Allocate one item from 'zone' with M_WAITOK and free it again at once,
  * using the allocator/free pair that matches the zone type (SMR, per-CPU
  * or regular).
  */
 void
 uma_zwait(uma_zone_t zone)
 {
 
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
 		uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK));
 		return;
 	}
 	if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) {
 		uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK));
 		return;
 	}
 	uma_zfree(zone, uma_zalloc(zone, M_WAITOK));
 }
 
 /*
  * Allocate an item from a per-CPU zone and return it in its offset
  * ("pcpu") form.  M_ZERO is stripped before calling uma_zalloc_arg() and
  * honoured here instead by clearing the item: every CPU's copy on SMP
  * kernels, the single copy otherwise.  Returns NULL if the underlying
  * allocation fails.
  */
 void *
 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
 {
 	void *base, *pcpu_item;
 #ifdef SMP
 	int cpu;
 
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 #endif
 	base = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
 	if (base == NULL)
 		return (NULL);
 	pcpu_item = zpcpu_base_to_offset(base);
 	if ((flags & M_ZERO) == 0)
 		return (pcpu_item);
 
 #ifdef SMP
 	for (cpu = 0; cpu <= mp_maxid; cpu++)
 		bzero(zpcpu_get_cpu(pcpu_item, cpu), zone->uz_size);
 #else
 	bzero(base, zone->uz_size);
 #endif
 	return (pcpu_item);
 }
 
 /*
  * Free an item obtained from a per-CPU zone.  The offset-form pointer is
  * converted back to its base address and passed to uma_zfree_arg(); the
  * regular and per-CPU free paths are otherwise identical.
  */
 void
 uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata)
 {
 
 #ifdef SMP
 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 #endif
 
 	/* As with free(9), freeing a NULL item is a no-op. */
 	if (pcpu_item != NULL)
 		uma_zfree_arg(zone, zpcpu_offset_to_base(pcpu_item), udata);
 }
 
 /*
  * Run allocation-time processing on a freshly obtained item: sanitizer
  * marking, INVARIANTS trash checking, the zone's ctor (if any) and M_ZERO
  * handling.
  *
  *	uz_flags, size  Zone flags and item size as supplied by the caller
  *			(callers pass either the per-CPU cache copies or the
  *			zone's own fields).
  *	udata, flags    Passed through to the ctor.
  *
  * Returns the item, or NULL if the ctor failed; in that case uz_fails is
  * bumped and the item is handed back via zone_free_item().
  */
 static inline void *
 item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
     void *item)
 {
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
 
 	kasan_mark_item_valid(zone, item);
 	kmsan_mark_item_uninitialized(zone, item);
 
 #ifdef INVARIANTS
 	/* Run trash_ctor here unless it is already the zone's own ctor. */
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 &&
 	    zone->uz_ctor != trash_ctor)
 		trash_ctor(item, size, udata, flags);
 #endif
 
 	/* Check flags before loading ctor pointer. */
 	if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
 	    __predict_false(zone->uz_ctor != NULL) &&
 	    zone->uz_ctor(item, size, udata, flags) != 0) {
 		/* Ctor failed: count it and release the item without a dtor. */
 		counter_u64_add(zone->uz_fails, 1);
 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
 		return (NULL);
 	}
 #ifdef INVARIANTS
 	if (!skipdbg)
 		uma_dbg_alloc(zone, NULL, item);
 #endif
 	/* memset() returns its first argument, i.e. the item itself. */
 	if (__predict_false(flags & M_ZERO))
 		return (memset(item, 0, size));
 
 	return (item);
 }
 
 /*
  * Run free-time processing on an item: INVARIANTS free tracking, the
  * zone's dtor and trash filling (both only when 'skip' is below
  * SKIP_DTOR), and finally KASAN invalidation.
  *
  *	skip  How much of the teardown the caller wants skipped.
  */
 static inline void
 item_dtor(uma_zone_t zone, void *item, int size, void *udata,
     enum zfreeskip skip)
 {
 #ifdef INVARIANTS
 	bool skipdbg;
 
 	skipdbg = uma_dbg_zskip(zone, item);
 	if (skip == SKIP_NONE && !skipdbg) {
 		/* For malloc zones udata is forwarded to uma_dbg_free(). */
 		if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
 			uma_dbg_free(zone, udata, item);
 		else
 			uma_dbg_free(zone, NULL, item);
 	}
 #endif
 	if (__predict_true(skip < SKIP_DTOR)) {
 		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, size, udata);
 #ifdef INVARIANTS
 		/* Run trash_dtor here unless it is already the zone's dtor. */
 		if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
 		    zone->uz_dtor != trash_dtor)
 			trash_dtor(item, size, udata);
 #endif
 	}
 	kasan_mark_item_invalid(zone, item);
 }
 
 #ifdef NUMA
 /*
  * Return the VM domain of the physical memory backing 'item', asserting
  * that it lies within [0, vm_ndomains).
  */
 static int
 item_domain(void *item)
 {
 	int dom;
 
 	dom = vm_phys_domain(vtophys(item));
 	KASSERT(dom >= 0 && dom < vm_ndomains,
 	    ("%s: unknown domain for item %p", __func__, item));
 	return (dom);
 }
 #endif
 
 #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
 #define	UMA_ZALLOC_DEBUG
 /*
  * Debug-kernel hook run at the start of the allocation entry points.
  * Performs WITNESS and INVARIANTS sanity checks and, under DEBUG_MEMGUARD,
  * may satisfy the allocation from memguard instead of the zone.
  *
  *	itemp  Written only on the EJUSTRETURN path: the memguard item, or
  *	       NULL if the zone's init or ctor rejected it.
  *
  * Returns EJUSTRETURN when the caller must return *itemp immediately, or 0
  * when the normal allocation path should proceed.
  */
 static int
 uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
 {
 	int error;
 
 	error = 0;
 #ifdef WITNESS
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_debug: zone \"%s\"", zone->uz_name);
 	}
 #endif
 
 #ifdef INVARIANTS
 	KASSERT((flags & M_EXEC) == 0,
 	    ("uma_zalloc_debug: called with M_EXEC"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_debug: called within spinlock or critical section"));
 	KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
 	    ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
 #endif
 
 #ifdef DEBUG_MEMGUARD
 	if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
 		void *item;
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
 			error = EJUSTRETURN;
 			/*
 			 * NOTE(review): on either failure below the memguard
 			 * item is not handed back to memguard_free(); confirm
 			 * whether that is intentional.
 			 */
 			if (zone->uz_init != NULL &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0) {
 				*itemp = NULL;
 				return (error);
 			}
 			if (zone->uz_ctor != NULL &&
 			    zone->uz_ctor(item, zone->uz_size, udata,
 			    flags) != 0) {
 				counter_u64_add(zone->uz_fails, 1);
 				/*
 				 * A zone may have a ctor without a fini;
 				 * only undo the init when a fini exists.
 				 */
 				if (zone->uz_fini != NULL)
 					zone->uz_fini(item, zone->uz_size);
 				*itemp = NULL;
 				return (error);
 			}
 			*itemp = item;
 			return (error);
 		}
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 	return (error);
 }
 
 /*
  * Debug-kernel hook for the free path.  Asserts the caller is not in a
  * critical section and, under DEBUG_MEMGUARD, fully handles items that
  * came from memguard (dtor, fini, memguard_free).
  *
  * Returns EJUSTRETURN if the item was a memguard item and has been freed
  * here, otherwise 0.
  */
 static int
 uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
 {
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zfree_debug: called with spinlock or critical section held"));
 
 #ifdef DEBUG_MEMGUARD
 	if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
 		/* Mirror the init/ctor calls made in uma_zalloc_debug(). */
 		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, zone->uz_size, udata);
 		if (zone->uz_fini != NULL)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return (EJUSTRETURN);
 	}
 #endif
 	return (0);
 }
 #endif
 
 /*
  * Pop one item from a per-CPU cache bucket and construct it.  Entered in a
  * critical section, which is exited here before the ctor runs.
  *
  * The item size and zone flags are read from the per-CPU cache while still
  * inside the critical section and then passed to item_ctor().
  */
 static inline void *
 cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket,
     void *udata, int flags)
 {
 	void *item;
 	int size, uz_flags;
 
 	item = cache_bucket_pop(cache, bucket);
 	size = cache_uz_size(cache);
 	uz_flags = cache_uz_flags(cache);
 	critical_exit();
 	return (item_ctor(zone, uz_flags, size, udata, flags, item));
 }
 
 /*
  * Slow path taken when the current CPU's alloc bucket is empty.  Entered
  * in a critical section; every return path has left it.
  *
  * cache_alloc() is called repeatedly for as long as it reports success;
  * after each success the current CPU's cache is looked up again and an
  * item is taken from it if the alloc bucket now has one.  When
  * cache_alloc() fails, a single item is requested directly from the zone.
  */
 static __noinline void *
 cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 {
 	uma_cache_bucket_t bucket;
 	int domain;
 
 	while (cache_alloc(zone, cache, udata, flags)) {
 		/* Re-read the cache pointer for whichever CPU we are on now. */
 		cache = &zone->uz_cpu[curcpu];
 		bucket = &cache->uc_allocbucket;
 		if (__predict_false(bucket->ucb_cnt == 0))
 			continue;
 		return (cache_alloc_item(zone, cache, bucket, udata, flags));
 	}
 	critical_exit();
 
 	/*
 	 * We can not get a bucket so try to return a single item.
 	 */
 	if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH)
 		domain = PCPU_GET(domain);
 	else
 		domain = UMA_ANYDOMAIN;
 	return (zone_alloc_item(zone, udata, domain, flags));
 }
 
 /*
  * See uma.h
  *
  * Allocate an item from an SMR zone.  No udata is passed to the ctor.
  * The fast path pops an item from the current CPU's alloc bucket inside a
  * critical section; an empty bucket falls through to cache_alloc_retry().
  */
 void *
 uma_zalloc_smr(uma_zone_t zone, int flags)
 {
 	uma_cache_bucket_t bucket;
 	uma_cache_t cache;
 
 #ifdef UMA_ZALLOC_DEBUG
 	void *item;
 
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
 	    ("uma_zalloc_smr: called with non-SMR zone."));
 	if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
 		return (item);
 #endif
 
 	/* Both helpers below exit the critical section before returning. */
 	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	bucket = &cache->uc_allocbucket;
 	if (__predict_false(bucket->ucb_cnt == 0))
 		return (cache_alloc_retry(zone, cache, NULL, flags));
 	return (cache_alloc_item(zone, cache, bucket, NULL, flags));
 }
 
 /*
  * See uma.h
  *
  * Allocate an item from a non-SMR zone, passing 'udata' to the zone's
  * ctor.  The fast path pops an item from the current CPU's alloc bucket;
  * an empty bucket falls through to cache_alloc_retry().
  */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
 	uma_cache_bucket_t bucket;
 	uma_cache_t cache;
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
 	    zone, flags);
 
 #ifdef UMA_ZALLOC_DEBUG
 	void *item;
 
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 	    ("uma_zalloc_arg: called with SMR zone."));
 	/* The debug hook may have satisfied the request itself. */
 	if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
 		return (item);
 #endif
 
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to allocate from
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
 	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	bucket = &cache->uc_allocbucket;
 	if (__predict_false(bucket->ucb_cnt == 0))
 		return (cache_alloc_retry(zone, cache, udata, flags));
 	return (cache_alloc_item(zone, cache, bucket, udata, flags));
 }
 
 /*
  * Replenish an alloc bucket and possibly restore an old one.  Called in
  * a critical section.  Returns in a critical section.
  *
  * The critical section is dropped internally while a bucket is fetched
  * or allocated, so the caller must re-read its per-CPU cache pointer
  * after this function returns.
  *
  * A false return value indicates an allocation failure.
  * A true return value indicates success and the caller should retry.
  */
 static __noinline bool
 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 {
 	uma_bucket_t bucket;
 	int curdomain, domain;
 	bool new;
 
 	CRITICAL_ASSERT(curthread);
 
 	/*
 	 * If we have run out of items in our alloc bucket see
 	 * if we can switch with the free bucket.
 	 *
 	 * SMR Zones can't re-use the free bucket until the sequence has
 	 * expired.
 	 */
 	if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 &&
 	    cache->uc_freebucket.ucb_cnt != 0) {
 		cache_bucket_swap(&cache->uc_freebucket,
 		    &cache->uc_allocbucket);
 		return (true);
 	}
 
 	/*
 	 * Discard any empty allocation bucket while we hold no locks.
 	 */
 	bucket = cache_bucket_unload_alloc(cache);
 	critical_exit();
 
 	if (bucket != NULL) {
 		KASSERT(bucket->ub_cnt == 0,
 		    ("cache_alloc: Entered with non-empty alloc bucket."));
 		bucket_free(zone, bucket, udata);
 	}
 
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zdom lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
 	 * to the cache.  Since the critical section is released, we may be
 	 * preempted or migrate.  As such, make sure not to maintain any
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
 	domain = PCPU_GET(domain);
 	if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 ||
 	    VM_DOMAIN_EMPTY(domain))
 		domain = zone_domain_highest(zone, domain);
 	/*
 	 * Prefer a bucket already cached for the domain; only allocate a
 	 * new one if buckets are in use for this zone and not disabled.
 	 * 'new' records which of the two happened.
 	 */
 	bucket = cache_fetch_bucket(zone, cache, domain);
 	if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) {
 		bucket = zone_alloc_bucket(zone, udata, domain, flags);
 		new = true;
 	} else {
 		new = false;
 	}
 
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
 	if (bucket == NULL) {
 		/* Honour the "returns in a critical section" contract. */
 		critical_enter();
 		return (false);
 	}
 
 	/*
 	 * See if we lost the race or were migrated.  Cache the
 	 * initialized bucket to make this less likely or claim
 	 * the memory directly.
 	 */
 	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	/*
 	 * Install the bucket only if this CPU's alloc slot is still empty
 	 * and, for first-touch zones, the bucket's domain matches the
 	 * current CPU's domain (or that domain is empty).
 	 */
 	if (cache->uc_allocbucket.ucb_bucket == NULL &&
 	    ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 ||
 	    (curdomain = PCPU_GET(domain)) == domain ||
 	    VM_DOMAIN_EMPTY(curdomain))) {
 		/* A freshly allocated bucket adds its items to uzd_imax. */
 		if (new)
 			atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax,
 			    bucket->ub_cnt);
 		cache_bucket_load_alloc(cache, bucket);
 		return (true);
 	}
 
 	/*
 	 * We lost the race, release this bucket and start over.
 	 */
 	critical_exit();
 	zone_put_bucket(zone, domain, bucket, udata, !new);
 	critical_enter();
 
 	return (true);
 }
 
 /*
  * Allocate an item from a specific VM domain of a first-touch zone.
  *
  * On NUMA kernels with more than one domain, the per-CPU caches are
  * bypassed: an item is taken from the domain's bucket cache if one is
  * available, otherwise from zone_alloc_item().  In all other
  * configurations this is equivalent to uma_zalloc_arg().
  */
 void *
 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
 {
 #ifdef NUMA
 	uma_bucket_t bucket;
 	uma_zone_domain_t zdom;
 	void *item;
 #endif
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	/* This is the fast path allocation */
 	CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d",
 	    zone->uz_name, zone, domain, flags);
 
 	if (flags & M_WAITOK) {
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
 	}
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 	    ("uma_zalloc_domain: called with SMR zone."));
 #ifdef NUMA
 	KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0,
 	    ("uma_zalloc_domain: called with non-FIRSTTOUCH zone."));
 
 	/* With a single domain there is nothing to select. */
 	if (vm_ndomains == 1)
 		return (uma_zalloc_arg(zone, udata, flags));
 
 	/*
 	 * Try to allocate from the bucket cache before falling back to the keg.
 	 * We could try harder and attempt to allocate from per-CPU caches or
 	 * the per-domain cross-domain buckets, but the complexity is probably
 	 * not worth it.  It is more important that frees of previous
 	 * cross-domain allocations do not blow up the cache.
 	 */
 	zdom = zone_domain_lock(zone, domain);
 	if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) {
 		/* Take the last item and give the bucket back to the cache. */
 		item = bucket->ub_bucket[bucket->ub_cnt - 1];
 #ifdef INVARIANTS
 		bucket->ub_bucket[bucket->ub_cnt - 1] = NULL;
 #endif
 		bucket->ub_cnt--;
 		zone_put_bucket(zone, domain, bucket, udata, true);
 		item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata,
 		    flags, item);
 		/* item_ctor() returns NULL if the zone's ctor failed. */
 		if (item != NULL) {
 			KASSERT(item_domain(item) == domain,
 			    ("%s: bucket cache item %p from wrong domain",
 			    __func__, item));
 			counter_u64_add(zone->uz_allocs, 1);
 		}
 		return (item);
 	}
 	/* No cached bucket for this domain; allocate a single item. */
 	ZDOM_UNLOCK(zdom);
 	return (zone_alloc_item(zone, udata, domain, flags));
 #else
 	return (uma_zalloc_arg(zone, udata, flags));
 #endif
 }
 
 /*
  * Find a slab with some space.  Prefer slabs that are partially used over those
  * that are totally full.  This helps to reduce fragmentation.
  *
  * If 'rr' is true, search all domains starting from 'domain'.  Otherwise check
  * only 'domain'.
  *
  * A slab taken from a domain's free list is moved to the head of that
  * domain's partial list before it is returned.  Returns NULL if no slab
  * was found.
  */
 static uma_slab_t
 keg_first_slab(uma_keg_t keg, int domain, bool rr)
 {
 	uma_domain_t dom;
 	uma_slab_t slab;
 	int start;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_first_slab: domain %d out of range", domain));
 	KEG_LOCK_ASSERT(keg, domain);
 
 	slab = NULL;
 	start = domain;
 	do {
 		dom = &keg->uk_domain[domain];
 		if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL)
 			return (slab);
 		if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) {
 			LIST_REMOVE(slab, us_link);
 			dom->ud_free_slabs--;
 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		/*
 		 * Without 'rr' the domain never advances, so the loop
 		 * condition fails after this single pass.
 		 */
 		if (rr)
 			domain = (domain + 1) % vm_ndomains;
 	} while (domain != start);
 
 	return (NULL);
 }
 
 /*
  * Look for an already allocated slab with free items on the partial or
  * free lists.  On success the keg domain lock is left held for the caller;
  * on failure the lock has been dropped and NULL is returned.
  *
  * Unless M_USE_RESERVE is passed, the keg's reserve of free items is left
  * untouched.
  */
 static uma_slab_t
 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
 {
 	uma_slab_t slab;
 	uint32_t reserve;
 
 	/* HASH has a single free list. */
 	if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
 		domain = 0;
 
 	KEG_LOCK(keg, domain);
 	if ((flags & M_USE_RESERVE) != 0)
 		reserve = 0;
 	else
 		reserve = keg->uk_reserve;
 
 	/* Only search the lists when we are above the reserve. */
 	if (keg->uk_domain[domain].ud_free_items > reserve) {
 		slab = keg_first_slab(keg, domain, rr);
 		if (slab != NULL)
 			return (slab);
 	}
 
 	KEG_UNLOCK(keg, domain);
 	return (NULL);
 }
 
 /*
  * Obtain a slab with at least one free item, either by reusing an existing
  * slab or by allocating a new one through keg_alloc_slab().
  *
  * 'rdomain' is either a specific domain or UMA_ANYDOMAIN, in which case the
  * keg's domainset policy is iterated.  With M_NOVM no new slab is ever
  * requested.  With M_WAITOK the function keeps retrying (sleeping in
  * vm_wait_doms() in the round-robin case) instead of failing.
  *
  * Returns NULL on failure.  Callers in this file unlock the slab's keg
  * domain after a successful return.
  */
 static uma_slab_t
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
 {
 	struct vm_domainset_iter di;
 	uma_slab_t slab;
 	int aflags, domain;
 	bool rr;
 
 restart:
 	/*
 	 * Use the keg's policy if upper layers haven't already specified a
 	 * domain (as happens with first-touch zones).
 	 *
 	 * To avoid races we run the iterator with the keg lock held, but that
 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
 	 * clear M_WAITOK and handle low memory conditions locally.
 	 */
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 		    &aflags);
 	} else {
 		aflags = flags;
 		domain = rdomain;
 	}
 
 	for (;;) {
 		/* First try to reuse a slab that already has free items. */
 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
 		if (slab != NULL)
 			return (slab);
 
 		/*
 		 * M_NOVM means don't ask at all!
 		 */
 		if (flags & M_NOVM)
 			break;
 
 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
 		if (slab != NULL)
 			return (slab);
 		/* A fixed-domain, non-sleeping request gives up here. */
 		if (!rr && (flags & M_WAITOK) == 0)
 			break;
 		/*
 		 * Advance to the next domain of the policy.  Once the iterator
 		 * is exhausted, either wait for memory and start over or fail.
 		 */
 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
 			if ((flags & M_WAITOK) != 0) {
 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
 				goto restart;
 			}
 			break;
 		}
 	}
 
 	/*
 	 * We might not have been able to get a slab but another cpu
 	 * could have while we were unlocked.  Check again before we
 	 * fail.
 	 */
 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL)
 		return (slab);
 
 	return (NULL);
 }
 
 /*
  * Take one free item out of 'slab' and return its address.  The slab and
  * per-domain free counts are updated, and a slab that becomes fully
  * allocated is moved to the domain's full list.  The keg lock for the
  * slab's domain must be held.
  */
 static void *
 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
 	uma_domain_t dom;
 	int idx;
 
 	KEG_LOCK_ASSERT(keg, slab->us_domain);
 
 	/* Claim the first item marked free in the slab's bitmap. */
 	idx = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
 	BIT_CLR(keg->uk_ipers, idx, &slab->us_free);
 
 	/* Account for the allocation in the domain and in the slab. */
 	dom = &keg->uk_domain[slab->us_domain];
 	dom->ud_free_items--;
 	slab->us_freecount--;
 
 	/*
 	 * A slab with nothing left goes onto the full list.  It must have
 	 * been on the partial list, since keg_fetch_slab() only returns
 	 * slabs from there, so the free slab count is unaffected.
 	 */
 	if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
 	}
 
 	return (slab_item(slab, keg, idx));
 }
 
 /*
  * Import callback that fills 'bucket' with up to 'max' items taken from
  * the slabs of the zone's keg.
  *
  * Arguments:
  *	arg    The zone to allocate for.
  *	bucket Array receiving the item pointers.
  *	max    Maximum number of items to store in 'bucket'.
  *	domain The domain to allocate from or UMA_ANYDOMAIN.
  *	flags  Allocation flags passed on to keg_fetch_slab().
  *
  * Returns:
  *	The number of items stored in 'bucket', which may be less than
  *	'max' (including zero) when no further slab could be obtained or
  *	when the keg's reserve would otherwise be depleted.
  */
 static int
 zone_import(void *arg, void **bucket, int max, int domain, int flags)
 {
 	uma_domain_t dom;
 	uma_zone_t zone;
 	uma_slab_t slab;
 	uma_keg_t keg;
 #ifdef NUMA
 	int stripe;
 #endif
 	int i;
 
 	zone = arg;
 	slab = NULL;
 	keg = zone->uz_keg;
 	/* Try to keep the buckets totally full */
 	for (i = 0; i < max; ) {
 		if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
 			break;
 #ifdef NUMA
 		/* The stripe budget restarts with every slab fetched. */
 		stripe = howmany(max, vm_ndomains);
 #endif
 		dom = &keg->uk_domain[slab->us_domain];
 		do {
 			bucket[i++] = slab_alloc_item(keg, slab);
 			if (dom->ud_free_items <= keg->uk_reserve) {
 				/*
 				 * Avoid depleting the reserve after a
 				 * successful item allocation, even if
 				 * M_USE_RESERVE is specified.
 				 */
 				KEG_UNLOCK(keg, slab->us_domain);
 				goto out;
 			}
 #ifdef NUMA
 			/*
 			 * If the zone is striped we pick a new slab for every
 			 * N allocations.  Eliminating this conditional will
 			 * instead pick a new domain for each bucket rather
 			 * than stripe within each bucket.  The current option
 			 * produces more fragmentation and requires more cpu
 			 * time but yields better distribution.
 			 */
 			if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 &&
 			    vm_ndomains > 1 && --stripe == 0)
 				break;
 #endif
 		} while (slab->us_freecount != 0 && i < max);
 		KEG_UNLOCK(keg, slab->us_domain);
 
 		/* Don't block if we allocated any successfully. */
 		flags &= ~M_WAITOK;
 		flags |= M_NOWAIT;
 	}
 out:
 	return (i);
 }
 
 /*
  * Slow path of zone_alloc_limit().  The caller has already added 'count'
  * to uz_items; that claim is handed back first.  With M_NOWAIT the function
  * then returns 0.  Otherwise it registers the thread as a sleeper in
  * uz_items, sleeps on uz_max_items and retries after wakeup.
  *
  * Returns the number of items granted, which may be smaller than 'count'.
  */
 static int
 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
 {
 	uint64_t old, new, total, max;
 
 	/*
 	 * The hard case.  We're going to sleep because there were existing
 	 * sleepers or because we ran out of items.  This routine enforces
 	 * fairness by keeping fifo order.
 	 *
 	 * First release our ill-gotten gains and make some noise.
 	 */
 	for (;;) {
 		zone_free_limit(zone, count);
 		zone_log_warning(zone);
 		zone_maxaction(zone);
 		if (flags & M_NOWAIT)
 			return (0);
 
 		/*
 		 * We need to allocate an item or set ourself as a sleeper
 		 * while the sleepq lock is held to avoid wakeup races.  This
 		 * is essentially a home rolled semaphore.
 		 */
 		sleepq_lock(&zone->uz_max_items);
 		old = zone->uz_items;
 		do {
 			MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
 			/* Cache the max since we will evaluate twice. */
 			max = zone->uz_max_items;
 			/*
 			 * Queue behind existing sleepers or when the limit is
 			 * reached; otherwise take as many items as still fit.
 			 */
 			if (UZ_ITEMS_SLEEPERS(old) != 0 ||
 			    UZ_ITEMS_COUNT(old) >= max)
 				new = old + UZ_ITEMS_SLEEPER;
 			else
 				new = old + MIN(count, max - old);
 		} while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
 
 		/* We may have successfully allocated under the sleepq lock. */
 		if (UZ_ITEMS_SLEEPERS(new) == 0) {
 			sleepq_release(&zone->uz_max_items);
 			return (new - old);
 		}
 
 		/*
 		 * This is in a different cacheline from uz_items so that we
 		 * don't constantly invalidate the fastpath cacheline when we
 		 * adjust item counts.  This could be limited to toggling on
 		 * transitions.
 		 */
 		atomic_add_32(&zone->uz_sleepers, 1);
 		atomic_add_64(&zone->uz_sleeps, 1);
 
 		/*
 		 * We have added ourselves as a sleeper.  The sleepq lock
 		 * protects us from wakeup races.  Sleep now and then retry.
 		 */
 		sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
 		sleepq_wait(&zone->uz_max_items, PVM);
 
 		/*
 		 * After wakeup, remove ourselves as a sleeper and try
 		 * again.  We no longer have the sleepq lock for protection.
 		 *
 		 * Subtract ourselves as a sleeper while attempting to add
 		 * our count.
 		 */
 		atomic_subtract_32(&zone->uz_sleepers, 1);
 		old = atomic_fetchadd_64(&zone->uz_items,
 		    -(UZ_ITEMS_SLEEPER - count));
 		/* We're no longer a sleeper. */
 		old -= UZ_ITEMS_SLEEPER;
 
 		/*
 		 * If we're still at the limit, restart.  Notably do not
 		 * block on other sleepers.  Cache the max value to protect
 		 * against changes via sysctl.
 		 */
 		total = UZ_ITEMS_COUNT(old);
 		max = zone->uz_max_items;
 		if (total >= max)
 			continue;
 		/* Truncate if necessary, otherwise wake other sleepers. */
 		if (total + count > max) {
 			zone_free_limit(zone, total + count - max);
 			count = max - total;
 		} else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
 			wakeup_one(&zone->uz_max_items);
 
 		return (count);
 	}
 }
 
 /*
  * Allocate 'count' items from our max_items limit.  Returns the number
  * available.  If M_NOWAIT is not specified it will sleep until at least
  * one item can be allocated.
  *
  * The fast path is a single atomic add to uz_items.  If that overshoots
  * the limit, the excess is handed back via zone_free_limit(), or the
  * request is passed to zone_alloc_limit_hard().
  */
 static int
 zone_alloc_limit(uma_zone_t zone, int count, int flags)
 {
 	uint64_t old;
 	uint64_t max;
 
 	max = zone->uz_max_items;
 	MPASS(max > 0);
 
 	/*
 	 * We expect normal allocations to succeed with a simple
 	 * fetchadd.
 	 */
 	old = atomic_fetchadd_64(&zone->uz_items, count);
 	if (__predict_true(old + count <= max))
 		return (count);
 
 	/*
 	 * If we had some items and no sleepers just return the
 	 * truncated value.  We have to release the excess space
 	 * though because that may wake sleepers who weren't woken
 	 * because we were temporarily over the limit.
 	 *
 	 * NOTE(review): 'old < max' doubling as the "no sleepers" test
 	 * presumably relies on the sleeper count living in the upper bits
 	 * of uz_items -- confirm against the UZ_ITEMS_* definitions.
 	 */
 	if (old < max) {
 		zone_free_limit(zone, (old + count) - max);
 		return (max - old);
 	}
 	return (zone_alloc_limit_hard(zone, count, flags));
 }
 
 /*
  * Free a number of items back to the limit.
  *
  * Subtracts 'count' (which must be positive) from uz_items and wakes one
  * thread sleeping on uz_max_items when there are sleepers and the item
  * count has dropped below the limit.
  */
 static void
 zone_free_limit(uma_zone_t zone, int count)
 {
 	uint64_t old;
 
 	MPASS(count > 0);
 
 	/*
 	 * In the common case we either have no sleepers or
 	 * are still over the limit and can just return.
 	 */
 	old = atomic_fetchadd_64(&zone->uz_items, -count);
 	if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
 	   UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
 		return;
 
 	/*
 	 * Moderate the rate of wakeups.  Sleepers will continue
 	 * to generate wakeups if necessary.
 	 */
 	wakeup_one(&zone->uz_max_items);
 }
 
 /*
  * Allocate a bucket and fill it with items imported via uz_import.
  *
  * Arguments:
  *	zone   The zone to allocate for.
  *	udata  Opaque data passed to bucket_alloc()/bucket_free().
  *	domain The domain to import from or UMA_ANYDOMAIN.
  *	flags  Allocation flags for the import and the zone init.
  *
  * Returns:
  *	A bucket holding at least one item, or NULL if the item limit is
  *	exhausted, no bucket could be allocated, or no item could be
  *	imported and initialized.
  *
  * For zones with an item limit the space is reserved up front without
  * sleeping, and whatever ends up unused is handed back before returning.
  */
 static uma_bucket_t
 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	uma_bucket_t bucket;
 	int error, maxbucket, cnt;
 
 	CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
 	    zone, domain);
 
 	/* Avoid allocs targeting empty domains. */
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 		domain = UMA_ANYDOMAIN;
 	else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
 		domain = UMA_ANYDOMAIN;
 
 	if (zone->uz_max_items > 0)
 		maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
 		    M_NOWAIT);
 	else
 		maxbucket = zone->uz_bucket_size;
 	/* Nothing was reserved, so there is nothing to hand back. */
 	if (maxbucket == 0)
 		return (NULL);
 
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
 	if (bucket == NULL) {
 		cnt = 0;
 		goto out;
 	}
 
 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
 	    MIN(maxbucket, bucket->ub_entries), domain, flags);
 
 	/*
 	 * Initialize the memory if necessary.
 	 */
 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
 		int i;
 
 		for (i = 0; i < bucket->ub_cnt; i++) {
 			kasan_mark_item_valid(zone, bucket->ub_bucket[i]);
 			error = zone->uz_init(bucket->ub_bucket[i],
 			    zone->uz_size, flags);
 			kasan_mark_item_invalid(zone, bucket->ub_bucket[i]);
 			if (error != 0)
 				break;
 		}
 
 		/*
 		 * If we couldn't initialize the whole bucket, put the
 		 * rest back onto the freelist.
 		 */
 		if (i != bucket->ub_cnt) {
 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
 			    bucket->ub_cnt - i);
 #ifdef INVARIANTS
 			bzero(&bucket->ub_bucket[i],
 			    sizeof(void *) * (bucket->ub_cnt - i));
 #endif
 			bucket->ub_cnt = i;
 		}
 	}
 
 	cnt = bucket->ub_cnt;
 	if (bucket->ub_cnt == 0) {
 		bucket_free(zone, bucket, udata);
 		counter_u64_add(zone->uz_fails, 1);
 		bucket = NULL;
 	}
 out:
 	/* Return the part of the reservation that was not used. */
 	if (zone->uz_max_items > 0 && cnt < maxbucket)
 		zone_free_limit(zone, maxbucket - cnt);
 
 	return (bucket);
 }
 
 /*
  * Allocates a single item from a zone.
  *
  * Arguments
  *	zone   The zone to alloc for.
  *	udata  The data to be passed to the constructor.
  *	domain The domain to allocate from or UMA_ANYDOMAIN.
  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
  *
  * Returns
  *	NULL if there is no memory and M_NOWAIT is set
  *	An item if successful
  *
  * NULL is also returned when the zone's init or ctor fails.  On every
  * failure after the limit was charged, the charge is given back.
  */
 
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 
 	/* Charge the item against the zone limit, if there is one. */
 	if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) {
 		counter_u64_add(zone->uz_fails, 1);
 		return (NULL);
 	}
 
 	/* Avoid allocs targeting empty domains. */
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 		domain = UMA_ANYDOMAIN;
 
 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail_cnt;
 
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
 	 * a keg slab directly to the user, and the user is expecting it
 	 * to be both zone-init'd as well as zone-ctor'd.
 	 */
 	if (zone->uz_init != NULL) {
 		int error;
 
 		kasan_mark_item_valid(zone, item);
 		error = zone->uz_init(item, zone->uz_size, flags);
 		kasan_mark_item_invalid(zone, item);
 		if (error != 0) {
 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
 			goto fail_cnt;
 		}
 	}
 	item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
 	    item);
 	/* A ctor failure skips the uz_fails increment at fail_cnt. */
 	if (item == NULL)
 		goto fail;
 
 	counter_u64_add(zone->uz_allocs, 1);
 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
 	    zone->uz_name, zone);
 
 	return (item);
 
 fail_cnt:
 	counter_u64_add(zone->uz_fails, 1);
 fail:
 	if (zone->uz_max_items > 0)
 		zone_free_limit(zone, 1);
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
 
 	return (NULL);
 }
 
 /* See uma.h */
 /*
  * Free 'item' to an SMR zone.  The item is pushed onto the current CPU's
  * free bucket (or cross-domain bucket for first-touch zones under NUMA).
  * If cache_free() cannot provide room in the per-CPU cache, the item is
  * released through zone_free_item().
  */
 void
 uma_zfree_smr(uma_zone_t zone, void *item)
 {
 	uma_cache_t cache;
 	uma_cache_bucket_t bucket;
 	int itemdomain, uz_flags;
 
 #ifdef UMA_ZALLOC_DEBUG
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
 	    ("uma_zfree_smr: called with non-SMR zone."));
 	KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
 	SMR_ASSERT_NOT_ENTERED(zone->uz_smr);
 	if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
 		return;
 #endif
 	/* The flags are read from the per-CPU cache before entering the
 	 * critical section. */
 	cache = &zone->uz_cpu[curcpu];
 	uz_flags = cache_uz_flags(cache);
 	itemdomain = 0;
 #ifdef NUMA
 	if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 		itemdomain = item_domain(item);
 #endif
 	critical_enter();
 	do {
 		/* Re-fetch the cache on each pass of the retry loop. */
 		cache = &zone->uz_cpu[curcpu];
 		/* SMR Zones must free to the free bucket. */
 		bucket = &cache->uc_freebucket;
 #ifdef NUMA
 		if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 		    PCPU_GET(domain) != itemdomain) {
 			bucket = &cache->uc_crossbucket;
 		}
 #endif
 		if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
 			cache_bucket_push(cache, bucket, item);
 			critical_exit();
 			return;
 		}
 	} while (cache_free(zone, cache, NULL, item, itemdomain));
 	critical_exit();
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 */
 	zone_free_item(zone, item, NULL, SKIP_NONE);
 }
 
 /* See uma.h */
 /*
  * Free 'item' to a non-SMR zone; 'udata' is passed on to the destructor.
  * A NULL item is ignored.  The destructor runs first, then the item is
  * pushed onto a per-CPU bucket when possible.  Otherwise, or when threads
  * are sleeping on the zone's item limit, it is released through
  * zone_free_item() with the destructor skipped.
  */
 void
 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
 	uma_cache_t cache;
 	uma_cache_bucket_t bucket;
 	int itemdomain, uz_flags;
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 
 	CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
 
 #ifdef UMA_ZALLOC_DEBUG
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 	    ("uma_zfree_arg: called with SMR zone."));
 	if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
 		return;
 #endif
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 
 	/*
 	 * We are accessing the per-cpu cache without a critical section to
 	 * fetch size and flags.  This is acceptable, if we are preempted we
 	 * will simply read another cpu's line.
 	 */
 	cache = &zone->uz_cpu[curcpu];
 	uz_flags = cache_uz_flags(cache);
 	if (UMA_ALWAYS_CTORDTOR ||
 	    __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
 		item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
 
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 */
 	if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
 		if (atomic_load_32(&zone->uz_sleepers) > 0)
 			goto zfree_item;
 	}
 
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
 	 * accessing the cache must not be preempted or yield during access,
 	 * and (2) the thread must not migrate CPUs without switching which
 	 * cache it accesses.  We rely on a critical section to prevent
 	 * preemption and migration.  We release the critical section in
 	 * order to acquire the zone mutex if we are unable to free to the
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
 	itemdomain = 0;
 #ifdef NUMA
 	if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 		itemdomain = item_domain(item);
 #endif
 	critical_enter();
 	do {
 		cache = &zone->uz_cpu[curcpu];
 		/*
 		 * Try to free into the allocbucket first to give LIFO
 		 * ordering for cache-hot datastructures.  Spill over
 		 * into the freebucket if necessary.  Alloc will swap
 		 * them if one runs dry.
 		 */
 		bucket = &cache->uc_allocbucket;
 #ifdef NUMA
 		if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 		    PCPU_GET(domain) != itemdomain) {
 			bucket = &cache->uc_crossbucket;
 		} else
 #endif
 		if (bucket->ucb_cnt == bucket->ucb_entries &&
 		   cache->uc_freebucket.ucb_cnt <
 		   cache->uc_freebucket.ucb_entries)
 			cache_bucket_swap(&cache->uc_freebucket,
 			    &cache->uc_allocbucket);
 		if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
 			cache_bucket_push(cache, bucket, item);
 			critical_exit();
 			return;
 		}
 	} while (cache_free(zone, cache, udata, item, itemdomain));
 	critical_exit();
 
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 * The dtor already ran above, hence SKIP_DTOR.
 	 */
 zfree_item:
 	zone_free_item(zone, item, udata, SKIP_DTOR);
 }
 
 #ifdef NUMA
 /*
  * Sort the items of a cross-domain free bucket into per-domain cross
  * buckets (uzd_cross) and cache every bucket that fills up.
  *
  * 'bucket' is always freed.  If a bucket allocation fails, sorting stops
  * early and any items still in 'bucket' are left for bucket_free().
  */
 static void
 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 {
 	struct uma_bucketlist emptybuckets, fullbuckets;
 	uma_zone_domain_t zdom;
 	uma_bucket_t b;
 	smr_seq_t seq;
 	void *item;
 	int domain;
 
 	CTR3(KTR_UMA,
 	    "uma_zfree: zone %s(%p) draining cross bucket %p",
 	    zone->uz_name, zone, bucket);
 
 	/*
 	 * It is possible for buckets to arrive here out of order so we fetch
 	 * the current smr seq rather than accepting the bucket's.
 	 */
 	seq = SMR_SEQ_INVALID;
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 		seq = smr_advance(zone->uz_smr);
 
 	/*
 	 * To avoid having ndomain * ndomain buckets for sorting we have a
 	 * lock on the current crossfree bucket.  A full matrix with
 	 * per-domain locking could be used if necessary.
 	 */
 	STAILQ_INIT(&emptybuckets);
 	STAILQ_INIT(&fullbuckets);
 	ZONE_CROSS_LOCK(zone);
 	for (; bucket->ub_cnt > 0; bucket->ub_cnt--) {
 		item = bucket->ub_bucket[bucket->ub_cnt - 1];
 		domain = item_domain(item);
 		zdom = ZDOM_GET(zone, domain);
 		if (zdom->uzd_cross == NULL) {
 			/* Reuse a spare empty bucket before allocating. */
 			if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
 				STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 				zdom->uzd_cross = b;
 			} else {
 				/*
 				 * Avoid allocating a bucket with the cross lock
 				 * held, since allocation can trigger a
 				 * cross-domain free and bucket zones may
 				 * allocate from each other.
 				 */
 				ZONE_CROSS_UNLOCK(zone);
 				b = bucket_alloc(zone, udata, M_NOWAIT);
 				if (b == NULL)
 					goto out;
 				ZONE_CROSS_LOCK(zone);
 				/*
 				 * uzd_cross may have been installed while the
 				 * lock was dropped; if so keep ours as a spare.
 				 */
 				if (zdom->uzd_cross != NULL) {
 					STAILQ_INSERT_HEAD(&emptybuckets, b,
 					    ub_link);
 				} else {
 					zdom->uzd_cross = b;
 				}
 			}
 		}
 		b = zdom->uzd_cross;
 		b->ub_bucket[b->ub_cnt++] = item;
 		b->ub_seq = seq;
 		/* Set a filled bucket aside and install a spare, if any. */
 		if (b->ub_cnt == b->ub_entries) {
 			STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link);
 			if ((b = STAILQ_FIRST(&emptybuckets)) != NULL)
 				STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 			zdom->uzd_cross = b;
 		}
 	}
 	ZONE_CROSS_UNLOCK(zone);
 out:
 	if (bucket->ub_cnt == 0)
 		bucket->ub_seq = SMR_SEQ_INVALID;
 	bucket_free(zone, bucket, udata);
 
 	/* Free unused spares, then cache the buckets that filled up. */
 	while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
 		STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 		bucket_free(zone, b, udata);
 	}
 	while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) {
 		STAILQ_REMOVE_HEAD(&fullbuckets, ub_link);
 		domain = item_domain(b->ub_bucket[0]);
 		zone_put_bucket(zone, domain, b, udata, true);
 	}
 }
 #endif
 
 /*
  * Hand a bucket of freed items to the zone layer.  On NUMA kernels a
  * bucket of a first-touch zone freed from a foreign domain is sorted via
  * zone_free_cross() when there are more than two domains.  In every other
  * case the bucket is put into the zone's domain bucket cache; 'ws' is
  * passed through to zone_put_bucket().
  */
 static void
 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
     int itemdomain, bool ws)
 {
 
 #ifdef NUMA
 	/*
 	 * Buckets coming from the wrong domain will be entirely for the
 	 * only other domain on two domain systems.  In this case we can
 	 * simply cache them.  Otherwise we need to sort them back to
 	 * correct domains.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 	    vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) {
 		zone_free_cross(zone, bucket, udata);
 		return;
 	}
 #endif
 
 	/*
 	 * Attempt to save the bucket in the zone's domain bucket cache.
 	 */
 	CTR3(KTR_UMA,
 	    "uma_zfree: zone %s(%p) putting bucket %p on free list",
 	    zone->uz_name, zone, bucket);
 	/* ub_cnt is pointing to the last free item */
 	/* Round-robin zones get their target domain from zone_domain_lowest(). */
 	if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
 		itemdomain = zone_domain_lowest(zone, itemdomain);
 	zone_put_bucket(zone, itemdomain, bucket, udata, ws);
 }
 
 /*
  * Populate a free or cross bucket for the current cpu cache.  Free any
  * existing full bucket either to the zone cache or back to the slab layer.
  *
  * Enters and returns in a critical section.  false return indicates that
  * we can not satisfy this free in the cache layer.  true indicates that
  * the caller should retry.
  *
  * The critical section is dropped while buckets are allocated and freed,
  * so the per-CPU cache is looked up again afterwards.  The 'cache'
  * argument is not relied upon; it is re-derived from curcpu on entry.
  * 'item' is not touched here.
  */
 static __noinline bool
 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
     int itemdomain)
 {
 	uma_cache_bucket_t cbucket;
 	uma_bucket_t newbucket, bucket;
 
 	CRITICAL_ASSERT(curthread);
 
 	/* A bucket size of zero means nothing can be cached. */
 	if (zone->uz_bucket_size == 0)
 		return (false);
 
 	cache = &zone->uz_cpu[curcpu];
 	newbucket = NULL;
 
 	/*
 	 * FIRSTTOUCH domains need to free to the correct zdom.  When
 	 * enabled this is the zdom of the item.   The bucket is the
 	 * cross bucket if the current domain and itemdomain do not match.
 	 */
 	cbucket = &cache->uc_freebucket;
 #ifdef NUMA
 	if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
 		if (PCPU_GET(domain) != itemdomain) {
 			cbucket = &cache->uc_crossbucket;
 			if (cbucket->ucb_cnt != 0)
 				counter_u64_add(zone->uz_xdomain,
 				    cbucket->ucb_cnt);
 		}
 	}
 #endif
 	bucket = cache_bucket_unload(cbucket);
 	KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries,
 	    ("cache_free: Entered with non-full free bucket."));
 
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
 	/*
 	 * Don't let SMR zones operate without a free bucket.  Force
 	 * a synchronize and re-use this one.  We will only degrade
 	 * to a synchronize every bucket_size items rather than every
 	 * item if we fail to allocate a bucket.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
 		if (bucket != NULL)
 			bucket->ub_seq = smr_advance(zone->uz_smr);
 		newbucket = bucket_alloc(zone, udata, M_NOWAIT);
 		if (newbucket == NULL && bucket != NULL) {
 			bucket_drain(zone, bucket);
 			newbucket = bucket;
 			bucket = NULL;
 		}
 	} else if (!bucketdisable)
 		newbucket = bucket_alloc(zone, udata, M_NOWAIT);
 
 	/* Pass the unloaded full bucket on to the zone layer. */
 	if (bucket != NULL)
 		zone_free_bucket(zone, bucket, udata, itemdomain, true);
 
 	critical_enter();
 	if ((bucket = newbucket) == NULL)
 		return (false);
 	cache = &zone->uz_cpu[curcpu];
 #ifdef NUMA
 	/*
 	 * Check to see if we should be populating the cross bucket.  If it
 	 * is already populated we will fall through and attempt to populate
 	 * the free bucket.
 	 */
 	if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
 		if (PCPU_GET(domain) != itemdomain &&
 		    cache->uc_crossbucket.ucb_bucket == NULL) {
 			cache_bucket_load_cross(cache, bucket);
 			return (true);
 		}
 	}
 #endif
 	/*
 	 * We may have lost the race to fill the bucket or switched CPUs.
 	 */
 	if (cache->uc_freebucket.ucb_bucket != NULL) {
 		critical_exit();
 		bucket_free(zone, bucket, udata);
 		critical_enter();
 	} else
 		cache_bucket_load_free(cache, bucket);
 
 	return (true);
 }
 
 /*
  * Return 'item' to 'slab': mark it free in the slab's bitmap, bump the
  * slab and per-domain free counts, and move the slab between the full,
  * partial and free lists as its occupancy changes.  The keg lock for the
  * slab's domain must be held.
  */
 static void
 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	uma_domain_t dom;
 	int idx;
 
 	keg = zone->uz_keg;
 	KEG_LOCK_ASSERT(keg, slab->us_domain);
 	dom = &keg->uk_domain[slab->us_domain];
 
 	/* Slab management. */
 	idx = slab_item_index(slab, keg, item);
 	BIT_SET(keg->uk_ipers, idx, &slab->us_free);
 	slab->us_freecount++;
 
 	/* Keg statistics. */
 	dom->ud_free_items++;
 
 	/* Does the slab now belong on a different list? */
 	if (slab->us_freecount == keg->uk_ipers) {
 		/* Every item is free again. */
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
 		dom->ud_free_slabs++;
 	} else if (slab->us_freecount == 1) {
 		/* The slab was full until this free. */
 		LIST_REMOVE(slab, us_link);
 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 	}
 }
 
 /*
  * Release callback that returns 'cnt' items from 'bucket' to their slabs.
  *
  * Arguments:
  *	arg    The zone the items belong to.
  *	bucket Array of item pointers.
  *	cnt    Number of entries in 'bucket'.
  *
  * The slab of each item is looked up according to the zone flags.  The keg
  * domain lock is switched only when the next item's slab is covered by a
  * different lock than the one currently held.
  */
 static void
 zone_release(void *arg, void **bucket, int cnt)
 {
 	struct mtx *lock;
 	uma_zone_t zone;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
 	void *item;
 	int i;
 
 	zone = arg;
 	keg = zone->uz_keg;
 	lock = NULL;
 	/* The hash lookup below is done with the domain 0 keg lock held. */
 	if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0))
 		lock = KEG_LOCK(keg, 0);
 	for (i = 0; i < cnt; i++) {
 		item = bucket[i];
 		if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) {
 			slab = vtoslab((vm_offset_t)item);
 		} else {
 			/* Mask the address down to the start of the slab. */
 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 			if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0)
 				slab = hash_sfind(&keg->uk_hash, mem);
 			else
 				slab = (uma_slab_t)(mem + keg->uk_pgoff);
 		}
 		/* Swap locks only if this slab needs a different one. */
 		if (lock != KEG_LOCKPTR(keg, slab->us_domain)) {
 			if (lock != NULL)
 				mtx_unlock(lock);
 			lock = KEG_LOCK(keg, slab->us_domain);
 		}
 		slab_free_item(zone, slab, item);
 	}
 	if (lock != NULL)
 		mtx_unlock(lock);
 }
 
 /*
  * Frees a single item to any zone.
  *
  * Arguments:
  *	zone   The zone to free to
  *	item   The item we're freeing
  *	udata  User supplied data for the dtor
  *	skip   Skip dtors and finis
  *
  * If SKIP_CNT is set in 'skip', neither the free counter nor the item
  * limit is updated.
  */
 static __noinline void
 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
 {
 
 	/*
 	 * If a free is sent directly to an SMR zone we have to
 	 * synchronize immediately because the item can instantly
 	 * be reallocated. This should only happen in degenerate
 	 * cases when no memory is available for per-cpu caches.
 	 */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
 		smr_synchronize(zone->uz_smr);
 
 	item_dtor(zone, item, zone->uz_size, udata, skip);
 
 	/* Run the zone's fini unless the caller asked to skip it. */
 	if (skip < SKIP_FINI && zone->uz_fini) {
 		kasan_mark_item_valid(zone, item);
 		zone->uz_fini(item, zone->uz_size);
 		kasan_mark_item_invalid(zone, item);
 	}
 
 	zone->uz_release(zone->uz_arg, &item, 1);
 
 	if (skip & SKIP_CNT)
 		return;
 
 	counter_u64_add(zone->uz_frees, 1);
 
 	if (zone->uz_max_items > 0)
 		zone_free_limit(zone, 1);
 }
 
 /* See uma.h */
 /*
  * Impose a limit of 'nitems' items on the zone.  The cache sizing is
  * adjusted first, then the limit and UMA_ZFLAG_LIMIT are set under the
  * zone lock and any threads sleeping on the limit are woken.  Returns
  * 'nitems' unchanged.
  */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
 
 	/*
 	 * If the limit is small, we may need to constrain the maximum per-CPU
 	 * cache size, or disable caching entirely.
 	 */
 	uma_zone_set_maxcache(zone, nitems);
 
 	/*
 	 * XXX This can misbehave if the zone has any allocations with
 	 * no limit and a limit is imposed.  There is currently no
 	 * way to clear a limit.
 	 */
 	ZONE_LOCK(zone);
 	zone->uz_max_items = nitems;
 	zone->uz_flags |= UMA_ZFLAG_LIMIT;
 	zone_update_caches(zone);
 	/* We may need to wake waiters. */
 	wakeup(&zone->uz_max_items);
 	ZONE_UNLOCK(zone);
 
 	return (nitems);
 }
 
 /* See uma.h */
 void
 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
 {
 	int bpcpu, bpdom, bsize, nb;
 
 	ZONE_LOCK(zone);
 
 	/*
 	 * Work out the minimum number of buckets the zone may have in use:
 	 * two per CPU, plus one more per CPU and one per domain when
 	 * cross-domain frees are possible.
 	 */
 	bpcpu = 2;
 	bpdom = 0;
 #ifdef NUMA
 	if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) {
 		bpcpu = 3;
 		bpdom = 1;
 	}
 #endif
 	nb = bpcpu * mp_ncpus + bpdom * vm_ndomains;
 
 	/*
 	 * Let those buckets hold at most half of the requested item count,
 	 * capped at BUCKET_MAX.  If that rounds down to zero while at least
 	 * one item per bucket would still fit, use single-item buckets.
 	 */
 	bsize = nitems / nb / 2;
 	if (bsize > BUCKET_MAX)
 		bsize = BUCKET_MAX;
 	else if (bsize == 0 && nitems / nb > 0)
 		bsize = 1;
 
 	zone->uz_bucket_size = bsize;
 	zone->uz_bucket_size_max = zone->uz_bucket_size;
 	if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
 		zone->uz_bucket_size_min = zone->uz_bucket_size_max;
 
 	/* Whatever is left over goes to the full bucket cache. */
 	zone->uz_bucket_max = nitems - nb * bsize;
 
 	ZONE_UNLOCK(zone);
 }
 
 /* See uma.h */
 int
 uma_zone_get_max(uma_zone_t zone)
 {
 
 	/* The 64-bit limit is narrowed to the int return type. */
 	return ((int)atomic_load_64(&zone->uz_max_items));
 }
 
 /* See uma.h */
 /*
  * Store the warning string for the zone.  Only the pointer is kept, so
  * the string must remain valid for as long as the zone may use it.
  */
 void
 uma_zone_set_warning(uma_zone_t zone, const char *warning)
 {
 
 	/* Presumably requires a zone with no allocations yet -- confirm. */
 	ZONE_ASSERT_COLD(zone);
 	zone->uz_warning = warning;
 }
 
 /* See uma.h */
 /*
  * Register 'maxaction' as the handler of the zone's uz_maxaction task; the
  * zone itself is the task's context argument.
  */
 void
 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
 {
 
 	ZONE_ASSERT_COLD(zone);
 	/* The callback is cast to the generic task function type. */
 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
 }
 
 /* See uma.h */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
 	int64_t live;
 	u_int cpu;
 
 	/* Zone-wide counters are only usable once they are allocated. */
 	if (zone->uz_allocs == EARLY_COUNTER ||
 	    zone->uz_frees == EARLY_COUNTER)
 		live = 0;
 	else
 		live = counter_u64_fetch(zone->uz_allocs) -
 		    counter_u64_fetch(zone->uz_frees);
 
 	/* Add what each CPU cache has accounted for on its own. */
 	CPU_FOREACH(cpu)
 		live += atomic_load_64(&zone->uz_cpu[cpu].uc_allocs) -
 		    atomic_load_64(&zone->uz_cpu[cpu].uc_frees);
 
 	/* The counters are read without synchronization; clamp at zero. */
 	if (live < 0)
 		return (0);
 	return (live);
 }
 
 /*
  * Sum of the zone-wide allocation counter (once it is no longer the
  * EARLY_COUNTER placeholder) and the per-CPU cache allocation counts.
  */
 static uint64_t
 uma_zone_get_allocs(uma_zone_t zone)
 {
 	uint64_t total;
 	u_int cpu;
 
 	total = (zone->uz_allocs != EARLY_COUNTER) ?
 	    counter_u64_fetch(zone->uz_allocs) : 0;
 	CPU_FOREACH(cpu)
 		total += atomic_load_64(&zone->uz_cpu[cpu].uc_allocs);
 
 	return (total);
 }
 
 /*
  * Sum of the zone-wide free counter (once it is no longer the
  * EARLY_COUNTER placeholder) and the per-CPU cache free counts.
  */
 static uint64_t
 uma_zone_get_frees(uma_zone_t zone)
 {
 	uint64_t total;
 	u_int cpu;
 
 	total = (zone->uz_frees != EARLY_COUNTER) ?
 	    counter_u64_fetch(zone->uz_frees) : 0;
 	CPU_FOREACH(cpu)
 		total += atomic_load_64(&zone->uz_cpu[cpu].uc_frees);
 
 	return (total);
 }
 
 #ifdef INVARIANTS
 /*
  * Total number of allocations made from all zones attached to the keg.
  * Used only for KEG_ASSERT_COLD().
  */
 static uint64_t
 uma_keg_get_allocs(uma_keg_t keg)
 {
 	uma_zone_t zone;
 	uint64_t total;
 
 	total = 0;
 	LIST_FOREACH(zone, &keg->uk_zones, uz_link) {
 		total += uma_zone_get_allocs(zone);
 	}
 
 	return (total);
 }
 #endif
 
 /* See uma.h */
 /*
  * Install 'uminit' as the init function of the zone's keg (uk_init).
  */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	/* Presumably requires a keg with no allocations yet -- confirm. */
 	KEG_ASSERT_COLD(keg);
 	keg->uk_init = uminit;
 }
 
 /* See uma.h */
 /*
  * Install 'fini' as the fini function of the zone's keg (uk_fini).
  */
 void
 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	/* Presumably requires a keg with no allocations yet -- confirm. */
 	KEG_ASSERT_COLD(keg);
 	keg->uk_fini = fini;
 }
 
 /* See uma.h */
 /*
  * Install 'zinit' as the zone-level init function (uz_init), which the
  * allocation paths call on items imported from the keg.
  */
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
 
 	ZONE_ASSERT_COLD(zone);
 	zone->uz_init = zinit;
 }
 
 /* See uma.h */
 /*
  * Install 'zfini' as the zone-level fini function (uz_fini), which
  * zone_free_item() calls before releasing an item.
  */
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
 
 	ZONE_ASSERT_COLD(zone);
 	zone->uz_fini = zfini;
 }
 
 /* See uma.h */
 /*
  * Install 'freef' as the free function of the zone's keg (uk_freef).
  */
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	/* Presumably requires a keg with no allocations yet -- confirm. */
 	KEG_ASSERT_COLD(keg);
 	keg->uk_freef = freef;
 }
 
 /* See uma.h */
 /*
  * Install 'allocf' as the allocation function of the zone's keg
  * (uk_allocf).
  */
 void
 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	/* Presumably requires a keg with no allocations yet -- confirm. */
 	KEG_ASSERT_COLD(keg);
 	keg->uk_allocf = allocf;
 }
 
 /* See uma.h */
 /*
  * Turn the zone into an SMR zone that uses 'smr'.  The zone must not
  * already be an SMR zone and 'smr' must not be NULL; both are asserted.
  */
 void
 uma_zone_set_smr(uma_zone_t zone, smr_t smr)
 {
 
 	ZONE_ASSERT_COLD(zone);
 
 	KASSERT(smr != NULL, ("Got NULL smr"));
 	KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 	    ("zone %p (%s) already uses SMR", zone, zone->uz_name));
 	zone->uz_flags |= UMA_ZONE_SMR;
 	zone->uz_smr = smr;
 	/* uz_flags changed, so the zone's caches are updated as well. */
 	zone_update_caches(zone);
 }
 
 /*
  * Return the zone's uz_smr field as stored; no check is made that the
  * zone actually is an SMR zone.
  */
 smr_t
 uma_zone_get_smr(uma_zone_t zone)
 {
 
 	return (zone->uz_smr);
 }
 
 /*
  * Record 'items' in the uk_reserve field of the keg obtained from 'zone'
  * via KEG_GET().  The keg is asserted cold (KEG_ASSERT_COLD) first.  See
  * uma.h for the public contract.
  */
 void
 uma_zone_reserve(uma_zone_t zone, int items)
 {
 	uma_keg_t keg;
 
 	KEG_GET(zone, keg);
 	KEG_ASSERT_COLD(keg);
 	keg->uk_reserve = items;
 }
 
 /*
  * Reserve kernel virtual address space sized for 'count' items of 'zone',
  * cap the zone at the resulting item limit and switch the keg to an
  * allocator that does not use a VM object.  Both the keg and the zone are
  * asserted cold.
  *
  * Returns 1 on success and 0 if kva_alloc() fails (in which case nothing
  * has been modified).  See uma.h for the public contract.
  */
 int
 uma_zone_reserve_kva(uma_zone_t zone, int count)
 {
 	uma_keg_t keg;
 	vm_offset_t kva;
 	u_int pages;
 
 	KEG_GET(zone, keg);
 	KEG_ASSERT_COLD(keg);
 	ZONE_ASSERT_COLD(zone);
 
 	/* Whole slabs needed to hold 'count' items, expressed in pages. */
 	pages = howmany(count, keg->uk_ipers) * keg->uk_ppera;
 
 	/*
 	 * With UMA_MD_SMALL_ALLOC, KVA is reserved only for multi-page slabs;
 	 * single-page slabs use uma_small_alloc (selected below) and leave
 	 * uk_kva at 0.  Without it, KVA is always reserved.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
 	if (keg->uk_ppera > 1) {
 #else
 	if (1) {
 #endif
 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
 		if (kva == 0)
 			return (0);
 	} else
 		kva = 0;
 
 	/* A keg may only have KVA reserved once. */
 	MPASS(keg->uk_kva == 0);
 	keg->uk_kva = kva;
 	keg->uk_offset = 0;
 	/*
 	 * NOTE(review): this multiplies a page count by items-per-slab; when
 	 * uk_ppera > 1 the result is larger than the number of items the
 	 * reserved slabs can hold -- confirm this is intended.
 	 */
 	zone->uz_max_items = pages * keg->uk_ipers;
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
 #else
 	keg->uk_allocf = noobj_alloc;
 #endif
 	/* Mark both keg and zone as limited and never returning memory. */
 	keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
 	zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
 	zone_update_caches(zone);
 
 	return (1);
 }
 
 /*
  * Pre-populate the keg behind 'zone' with enough slabs to hold 'items'
  * items.  Each newly allocated slab is moved onto its domain's free-slab
  * list.  The function keeps retrying until every slab has been obtained.
  * See uma.h for the public contract.
  */
 void
 uma_prealloc(uma_zone_t zone, int items)
 {
 	struct vm_domainset_iter di;
 	uma_domain_t dom;
 	uma_slab_t slab;
 	uma_keg_t keg;
 	int aflags, domain, slabs;
 
 	KEG_GET(zone, keg);
 	/* Round the item count up to whole slabs. */
 	slabs = howmany(items, keg->uk_ipers);
 	while (slabs-- > 0) {
 		aflags = M_NOWAIT;
 		/* Restart the domain iteration policy for every slab. */
 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 		    &aflags);
 		for (;;) {
 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
 			    aflags);
 			if (slab != NULL) {
 				dom = &keg->uk_domain[slab->us_domain];
 				/*
 				 * keg_alloc_slab() always returns a slab on the
 				 * partial list.
 				 */
 				LIST_REMOVE(slab, us_link);
 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
 				    us_link);
 				dom->ud_free_slabs++;
 				/*
 				 * NOTE(review): the keg lock for the slab's
 				 * domain is released here without a visible
 				 * acquire, so keg_alloc_slab() presumably
 				 * returns with it held -- confirm.
 				 */
 				KEG_UNLOCK(keg, slab->us_domain);
 				break;
 			}
 			/*
 			 * Try the next domain; a non-zero return presumably
 			 * means the policy is exhausted, so wait for memory
 			 * in the policy's domains before retrying.
 			 */
 			if (vm_domainset_iter_policy(&di, &domain) != 0)
 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
 		}
 	}
 }
 
 /*
  * Returns a snapshot of memory consumption in bytes.
  *
  * Cache zones (UMA_ZFLAG_CACHE) are measured as the number of items held
  * in the per-domain zone caches times the item size; all other zones are
  * measured as the number of pages held by the keg's domains.
  */
 size_t
 uma_zone_memory(uma_zone_t zone)
 {
 	size_t count, unit;
 	int dom;
 
 	count = 0;
 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) {
 		for (dom = 0; dom < vm_ndomains; dom++)
 			count += ZDOM_GET(zone, dom)->uzd_nitems;
 		unit = zone->uz_size;
 	} else {
 		for (dom = 0; dom < vm_ndomains; dom++)
 			count += zone->uz_keg->uk_domain[dom].ud_pages;
 		unit = PAGE_SIZE;
 	}
 
 	return (count * unit);
 }
 
 /*
  * Run the reclamation request 'req' without restricting it to a domain:
  * forwards to uma_reclaim_domain() with UMA_ANYDOMAIN.  See uma.h for the
  * public contract.
  */
 void
 uma_reclaim(int req)
 {
 	uma_reclaim_domain(req, UMA_ANYDOMAIN);
 }
 
 /*
  * Apply the reclamation request 'req' (one of UMA_RECLAIM_TRIM,
  * UMA_RECLAIM_DRAIN or UMA_RECLAIM_DRAIN_CPU) to every zone, passing
  * 'domain' through to the per-zone callbacks.  An unknown request panics.
  * The whole pass runs with uma_reclaim_lock held shared.
  */
 void
 uma_reclaim_domain(int req, int domain)
 {
 	void *arg;
 
 	bucket_enable();
 
 	/* The domain is smuggled through the callbacks' void * argument. */
 	arg = (void *)(uintptr_t)domain;
 	sx_slock(&uma_reclaim_lock);
 	switch (req) {
 	case UMA_RECLAIM_TRIM:
 		zone_foreach(zone_trim, arg);
 		break;
 	case UMA_RECLAIM_DRAIN:
 		zone_foreach(zone_drain, arg);
 		break;
 	case UMA_RECLAIM_DRAIN_CPU:
 		/* Drain the zones, flush the per-CPU caches, then drain again. */
 		zone_foreach(zone_drain, arg);
 		pcpu_cache_drain_safe(NULL);
 		zone_foreach(zone_drain, arg);
 		break;
 	default:
 		panic("unhandled reclamation request %d", req);
 	}
 
 	/*
 	 * Some slabs may have been freed, but the slab zones were visited
 	 * early in the pass above.  Visit them again so that pages which
 	 * became empty once the other zones were drained can be freed.  The
 	 * same has to be done for the buckets.
 	 */
 	zone_drain(slabzones[0], arg);
 	zone_drain(slabzones[1], arg);
 	bucket_zone_drain(domain);
 	sx_sunlock(&uma_reclaim_lock);
 }
 
 /*
  * Count of pending reclaim requests; non-zero means the worker has work.
  * Incremented by uma_reclaim_wakeup() and reset by uma_reclaim_worker().
  */
 static volatile int uma_reclaim_needed;
 
 /*
  * Request an asynchronous reclaim.  Only the request that moves the
  * counter away from zero issues the wakeup() on the uma_reclaim channel;
  * later requests just bump the counter.
  */
 void
 uma_reclaim_wakeup(void)
 {
 
 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
 		wakeup(uma_reclaim);
 }
 
 /*
  * Body of the reclaim worker; never returns.  Each iteration sleeps until
  * uma_reclaim_needed becomes non-zero, invokes the vm_lowmem event
  * handlers, performs a UMA_RECLAIM_DRAIN_CPU reclaim, clears the request
  * counter and then pauses for hz ticks.
  */
 void
 uma_reclaim_worker(void *arg __unused)
 {
 
 	for (;;) {
 		sx_xlock(&uma_reclaim_lock);
 		/*
 		 * Sleep on the uma_reclaim channel; the hz timeout means the
 		 * counter is re-checked at least once per second.
 		 */
 		while (atomic_load_int(&uma_reclaim_needed) == 0)
 			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
 			    hz);
 		/* Drop the lock: uma_reclaim() takes it shared itself. */
 		sx_xunlock(&uma_reclaim_lock);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
 		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
 		atomic_store_int(&uma_reclaim_needed, 0);
 		/* Don't fire more than once per-second. */
 		pause("umarclslp", hz);
 	}
 }
 
 /*
  * Apply the reclamation request 'req' to a single zone without restricting
  * it to a domain: forwards to uma_zone_reclaim_domain() with
  * UMA_ANYDOMAIN.  See uma.h for the public contract.
  */
 void
 uma_zone_reclaim(uma_zone_t zone, int req)
 {
 	uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN);
 }
 
 /*
  * Apply one reclamation request to a single zone, handing 'domain' to the
  * trim/drain routine through its opaque argument.  UMA_RECLAIM_DRAIN_CPU
  * flushes the zone's per-CPU caches before draining.  Any other request
  * value panics.
  */
 void
 uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain)
 {
 	void *domarg = (void *)(uintptr_t)domain;
 
 	if (req == UMA_RECLAIM_TRIM) {
 		zone_trim(zone, domarg);
 	} else if (req == UMA_RECLAIM_DRAIN) {
 		zone_drain(zone, domarg);
 	} else if (req == UMA_RECLAIM_DRAIN_CPU) {
 		pcpu_cache_drain_safe(zone);
 		zone_drain(zone, domarg);
 	} else {
 		panic("unhandled reclamation request %d", req);
 	}
 }
 
 /*
  * Returns non-zero when the zone currently records at least one sleeper
  * in uz_sleepers, zero otherwise.  See uma.h.
  */
 int
 uma_zone_exhausted(uma_zone_t zone)
 {
 	uint32_t sleepers;
 
 	sleepers = atomic_load_32(&zone->uz_sleepers);
 	return (sleepers > 0);
 }
 
 /*
  * Return the current value of uma_kmem_limit (set via uma_set_limit()).
  */
 unsigned long
 uma_limit(void)
 {
 
 	return (uma_kmem_limit);
 }
 
 /*
  * Overwrite uma_kmem_limit with 'limit'.  The store is a plain assignment;
  * no validation is performed here.
  */
 void
 uma_set_limit(unsigned long limit)
 {
 
 	uma_kmem_limit = limit;
 }
 
 /*
  * Return an atomically loaded snapshot of uma_kmem_total.
  */
 unsigned long
 uma_size(void)
 {
 
 	return (atomic_load_long(&uma_kmem_total));
 }
 
 /*
  * Return the difference between uma_kmem_limit and the current
  * uma_size() snapshot.  The result is signed, so it is negative when the
  * total exceeds the limit.
  */
 long
 uma_avail(void)
 {
 	unsigned long used;
 
 	used = uma_size();
 	return (uma_kmem_limit - used);
 }
 
 #ifdef DDB
 /*
  * Generate statistics across both the zone and its per-CPU caches.  Each
  * statistic is stored only if the pointer supplied for it is non-NULL.
  *
  * Note: does not update the zone statistics, as it can't safely clear the
  * per-CPU cache statistic.
  *
  *	cachefreep	items held in the per-CPU alloc, free and cross buckets
  *	allocsp		zone uz_allocs counter plus every per-CPU uc_allocs
  *	freesp		zone uz_frees counter plus every per-CPU uc_frees
  *	sleepsp		the zone's uz_sleeps
  *	xdomainp	zone uz_xdomain counter plus the items currently held
  *			in per-CPU cross buckets
  */
 static void
 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
 {
 	uma_cache_t cache;
 	uint64_t allocs, frees, sleeps, xdomain;
 	/* Accumulate in the same type that *cachefreep reports. */
 	long cachefree;
 	int cpu;
 
 	allocs = frees = sleeps = xdomain = 0;
 	cachefree = 0;
 	CPU_FOREACH(cpu) {
 		cache = &z->uz_cpu[cpu];
 		cachefree += cache->uc_allocbucket.ucb_cnt;
 		cachefree += cache->uc_freebucket.ucb_cnt;
 		/* Cross-bucket items count as both xdomain and cached. */
 		xdomain += cache->uc_crossbucket.ucb_cnt;
 		cachefree += cache->uc_crossbucket.ucb_cnt;
 		allocs += cache->uc_allocs;
 		frees += cache->uc_frees;
 	}
 	allocs += counter_u64_fetch(z->uz_allocs);
 	frees += counter_u64_fetch(z->uz_frees);
 	xdomain += counter_u64_fetch(z->uz_xdomain);
 	sleeps += z->uz_sleeps;
 	if (cachefreep != NULL)
 		*cachefreep = cachefree;
 	if (allocsp != NULL)
 		*allocsp = allocs;
 	if (freesp != NULL)
 		*freesp = frees;
 	if (sleepsp != NULL)
 		*sleepsp = sleeps;
 	if (xdomainp != NULL)
 		*xdomainp = xdomain;
 }
 #endif /* DDB */
 
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t kz;
 	uma_zone_t z;
 	int count;
 
 	count = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 	LIST_FOREACH(z, &uma_cachezones, uz_link)
 		count++;
 
 	rw_runlock(&uma_rwlock);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
 /*
  * Fill in the zone-wide fields of 'uth' and the per-CPU records in 'ups'
  * (mp_maxid + 1 entries) for zone 'z'.  Every per-CPU record is zeroed;
  * it is then populated from the CPU's cache unless 'internal' is set or
  * the CPU is absent.  The 'sbuf' argument is not used here.
  */
 static void
 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
     struct uma_percpu_stat *ups, bool internal)
 {
 	struct uma_percpu_stat *stat;
 	uma_cache_t pcpu;
 	int idx;
 
 	/* Items parked in the per-domain zone caches. */
 	for (idx = 0; idx < vm_ndomains; idx++)
 		uth->uth_zone_free += ZDOM_GET(z, idx)->uzd_nitems;
 
 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
 	uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain);
 	uth->uth_sleeps = z->uz_sleeps;
 
 	for (idx = 0; idx < mp_maxid + 1; idx++) {
 		stat = &ups[idx];
 		bzero(stat, sizeof(*ups));
 		if (!internal && !CPU_ABSENT(idx)) {
 			pcpu = &z->uz_cpu[idx];
 			stat->ups_cache_free += pcpu->uc_allocbucket.ucb_cnt;
 			stat->ups_cache_free += pcpu->uc_freebucket.ucb_cnt;
 			stat->ups_cache_free += pcpu->uc_crossbucket.ucb_cnt;
 			stat->ups_allocs = pcpu->uc_allocs;
 			stat->ups_frees = pcpu->uc_frees;
 		}
 	}
 }
 
 /*
  * Sysctl handler that exports statistics for every zone as a binary
  * stream: one struct uma_stream_header, then for each zone one struct
  * uma_type_header followed by mp_maxid + 1 struct uma_percpu_stat
  * records.  Keg-backed zones are emitted first, then the cache zones.
  * The zone lists are walked with uma_rwlock held for reading.
  *
  * Returns the error from sysctl_wire_old_buffer() or sbuf_finish().
  */
 static int
 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
 	struct uma_percpu_stat *ups;
 	struct sbuf sbuf;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uint64_t items;
 	uint32_t kfree, pages;
 	int count, error, i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	/* The stream is binary; do not append a trailing NUL. */
 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
 	/* Scratch array reused for each zone's per-CPU records. */
 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 
 	/* First pass: count the zones for the stream header. */
 	count = 0;
 	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
 
 	LIST_FOREACH(z, &uma_cachezones, uz_link)
 		count++;
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&ush, sizeof(ush));
 	ush.ush_version = UMA_STREAM_VERSION;
 	ush.ush_maxcpus = (mp_maxid + 1);
 	ush.ush_count = count;
 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 
 	/* Second pass: one record group per keg-backed zone. */
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		/* Keg-wide free items and pages, summed over all domains. */
 		kfree = pages = 0;
 		for (i = 0; i < vm_ndomains; i++) {
 			kfree += kz->uk_domain[i].ud_free_items;
 			pages += kz->uk_domain[i].ud_pages;
 		}
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			bzero(&uth, sizeof(uth));
 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
 			/*
 			 * For a zone with an item limit, derive the page count
 			 * from the zone's own item count; otherwise report the
 			 * keg-wide page total.
 			 */
 			if (z->uz_max_items > 0) {
 				items = UZ_ITEMS_COUNT(z->uz_items);
 				uth.uth_pages = (items / kz->uk_ipers) *
 					kz->uk_ppera;
 			} else
 				uth.uth_pages = pages;
 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
 			    kz->uk_ppera;
 			uth.uth_limit = z->uz_max_items;
 			uth.uth_keg_free = kfree;
 
 			/*
 			 * A zone is secondary if it is not the first entry
 			 * on the keg's zone list.
 			 */
 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			for (i = 0; i < mp_maxid + 1; i++)
 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 		}
 	}
 	/* Cache zones have no keg; only name and size are filled in. */
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		bzero(&uth, sizeof(uth));
 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 		uth.uth_size = z->uz_size;
 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 		for (i = 0; i < mp_maxid + 1; i++)
 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 	}
 
 	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	free(ups, M_TEMP);
 	return (error);
 }
 
 int
 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
 	int error, max;
 
 	max = uma_zone_get_max(zone);
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	uma_zone_set_max(zone, max);
 
 	return (0);
 }
 
 /*
  * Sysctl handler reporting uma_zone_get_cur() for a zone.
  *
  * Some callers register the sysctl for a global zone that may not exist
  * yet, so they hand over the address of the zone pointer: when arg2 is 0,
  * arg1 is a pointer to a zone pointer; otherwise arg1 is the zone itself.
  */
 int
 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone;
 	int inuse;
 
 	zone = (arg2 == 0) ? *(uma_zone_t *)arg1 : arg1;
 	inuse = uma_zone_get_cur(zone);
 
 	return (sysctl_handle_int(oidp, &inuse, 0, req));
 }
 
 /*
  * Sysctl handler reporting the 64-bit allocation count of the zone passed
  * directly in arg1.
  */
 static int
 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone;
 	uint64_t nallocs;
 
 	zone = arg1;
 	nallocs = uma_zone_get_allocs(zone);
 
 	return (sysctl_handle_64(oidp, &nallocs, 0, req));
 }
 
 /*
  * Sysctl handler reporting the 64-bit free count of the zone passed
  * directly in arg1.
  */
 static int
 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone;
 	uint64_t nfrees;
 
 	zone = arg1;
 	nfrees = uma_zone_get_frees(zone);
 
 	return (sysctl_handle_64(oidp, &nfrees, 0, req));
 }
 
 static int
 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	uma_zone_t zone = arg1;
 	int error;
 
 	sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
 	if (zone->uz_flags != 0)
 		sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
 	else
 		sbuf_printf(&sbuf, "0");
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 
 	return (error);
 }
 
 /*
  * Sysctl handler reporting, as an integer percentage, how much of a slab's
  * memory is usable by clients of the keg passed in arg1.
  *
  * The denominator is the slab's pages, plus the size of the separately
  * allocated slab header for UMA_ZFLAG_OFFPAGE kegs.  The numerator uses
  * the client's requested size and alignment rather than uk_rsize, because
  * the real size is also adjusted for internal implementation reasons (max
  * bitset size).  For UMA_ZONE_PCPU kegs the numerator is scaled by the
  * number of CPU slots.
  */
 static int
 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
 {
 	uma_keg_t keg;
 	int pct, slabsize, usable;
 
 	keg = arg1;
 
 	slabsize = keg->uk_ppera * PAGE_SIZE;
 	if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
 		slabsize += slabzone(keg->uk_ipers)->uz_keg->uk_rsize;
 
 	usable = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
 	if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
 		usable *= mp_maxid + 1;
 
 	pct = 100 * usable / slabsize;
 	return (sysctl_handle_int(oidp, &pct, 0, req));
 }
 
 /*
  * Sysctl handler reporting the item count extracted with UZ_ITEMS_COUNT()
  * from an atomic snapshot of uz_items for the zone in arg1.
  */
 static int
 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone;
 	uint64_t nitems;
 
 	zone = arg1;
 	nitems = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
 
 	return (sysctl_handle_64(oidp, &nitems, 0, req));
 }
 
 #ifdef INVARIANTS
 /*
  * Find the slab header that owns 'item' in 'zone' for the debugging
  * checks.  Returns NULL for cache zones (UMA_ZFLAG_CACHE), and otherwise
  * whatever the applicable lookup yields (hash_sfind() may presumably
  * return NULL when the address is unknown -- confirm).
  */
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
 
 	/*
 	 * It is safe to return the slab here even though the
 	 * zone is unlocked because the item's allocation state
 	 * essentially holds a reference.
 	 */
 	/* Mask the item address down to the start of its slab. */
 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
 		return (NULL);
 	/* Three lookup strategies: vtoslab(), in-slab header, or hash. */
 	if (zone->uz_flags & UMA_ZFLAG_VTOSLAB)
 		return (vtoslab((vm_offset_t)mem));
 	keg = zone->uz_keg;
 	if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0)
 		return ((uma_slab_t)(mem + keg->uk_pgoff));
 	/* The hash table is searched under the keg lock of domain 0. */
 	KEG_LOCK(keg, 0);
 	slab = hash_sfind(&keg->uk_hash, mem);
 	KEG_UNLOCK(keg, 0);
 
 	return (slab);
 }
 
 /*
  * Decide whether the debugging checks should be skipped for 'mem' in
  * 'zone'.  Cache zones are always skipped; every other zone defers to the
  * keg-level decision in uma_dbg_kskip().
  */
 static bool
 uma_dbg_zskip(uma_zone_t zone, void *mem)
 {
 	bool is_cache;
 
 	is_cache = (zone->uz_flags & UMA_ZFLAG_CACHE) != 0;
 	return (is_cache ? true : uma_dbg_kskip(zone->uz_keg, mem));
 }
 
 /*
  * Decide whether the debugging checks should be skipped for the item at
  * 'mem' in 'keg', based on dbg_divisor: 0 skips every item, 1 checks every
  * item, and any other value checks only items whose index is a multiple
  * of the divisor.  The index is the page number of the address, refined
  * to an item number when the keg holds more than one item per slab.  The
  * uma_skip_cnt / uma_dbg_cnt counters record the sampled decisions.
  */
 static bool
 uma_dbg_kskip(uma_keg_t keg, void *mem)
 {
 	uintptr_t addr, itemno;
 
 	if (dbg_divisor == 0)
 		return (true);
 	if (dbg_divisor == 1)
 		return (false);
 
 	addr = (uintptr_t)mem;
 	itemno = addr >> PAGE_SHIFT;
 	if (keg->uk_ipers > 1) {
 		itemno *= keg->uk_ipers;
 		itemno += (addr & PAGE_MASK) / keg->uk_rsize;
 	}
 
 	if (itemno % dbg_divisor == 0) {
 		counter_u64_add(uma_dbg_cnt, 1);
 		return (false);
 	}
 	counter_u64_add(uma_skip_cnt, 1);
 
 	return (true);
 }
 
 /*
  * Set up the slab's freei data such that uma_dbg_free can function.
  *
  * Marks 'item' as allocated in the slab's debug bitset.  If 'slab' is
  * NULL it is looked up with uma_dbg_getslab().  Panics if no slab is
  * found or if the item's bit was already set (duplicate allocation).
  */
 static void
 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	int freei;
 
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
 			panic("uma: item %p did not belong to zone %s",
 			    item, zone->uz_name);
 	}
 	keg = zone->uz_keg;
 	freei = slab_item_index(slab, keg, item);
 
 	/* Test-and-set: a bit that was already set means a double alloc. */
 	if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei,
 	    slab_dbg_bits(slab, keg)))
 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)",
 		    item, zone, zone->uz_name, slab, freei);
 }
 
 /*
  * Verifies freed addresses.  Checks for alignment, valid slab membership
  * and duplicate frees.
  *
  * If 'slab' is NULL it is looked up with uma_dbg_getslab().  Every failed
  * check panics; on success the item's bit in the slab's debug bitset is
  * cleared.
  */
 static void
 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
 	int freei;
 
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
 			panic("uma: Freed item %p did not belong to zone %s",
 			    item, zone->uz_name);
 	}
 	keg = zone->uz_keg;
 	freei = slab_item_index(slab, keg, item);
 
 	/* The index must fall inside the slab. */
 	if (freei >= keg->uk_ipers)
 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)",
 		    item, zone, zone->uz_name, slab, freei);
 
 	/* The address must be exactly the start of that item. */
 	if (slab_item(slab, keg, freei) != item)
 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)",
 		    item, zone, zone->uz_name, slab, freei);
 
 	/* Test-and-clear: a bit that was already clear means a double free. */
 	if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei,
 	    slab_dbg_bits(slab, keg)))
 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)",
 		    item, zone, zone->uz_name, slab, freei);
 }
 #endif /* INVARIANTS */
 
 #ifdef DDB
 /*
  * Collect the statistics shown by "show uma" for zone 'z' of keg 'kz'.
  * All output pointers must be valid.  On return *used is allocs minus
  * frees and *cachefree counts items in per-CPU buckets, per-domain zone
  * caches and (for zones that are not secondary) the keg's free items.
  *
  * Returns (used + cachefree) * kz->uk_size, which the caller uses as the
  * zone's total memory for sorting and display.
  */
 static int64_t
 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
 {
 	uint64_t frees;
 	int i;
 
 	/* Internal kegs: use the zone counters only, no per-CPU data. */
 	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 		*allocs = counter_u64_fetch(z->uz_allocs);
 		frees = counter_u64_fetch(z->uz_frees);
 		*sleeps = z->uz_sleeps;
 		*cachefree = 0;
 		*xdomain = 0;
 	} else
 		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
 		    xdomain);
 	for (i = 0; i < vm_ndomains; i++) {
 		*cachefree += ZDOM_GET(z, i)->uzd_nitems;
 		/*
 		 * Keg free items are attributed only to a zone that is not a
 		 * secondary (i.e. not a flagged zone past the list head).
 		 */
 		if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 		    (LIST_FIRST(&kz->uk_zones) != z)))
 			*cachefree += kz->uk_domain[i].ud_free_items;
 	}
 	*used = *allocs - frees;
 	return (((int64_t)*used + *cachefree) * kz->uk_size);
 }
 
 /*
  * DDB "show uma": print one line per keg-backed zone, largest total
  * memory first.  The /i modifier produces machine-parseable CSV output.
  *
  * No memory is allocated for sorting: each pass over all zones selects
  * the largest zone that is smaller than the previously printed one (or
  * tied with it and located after it in list order), prints it, and
  * repeats until no candidate is left.
  */
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
 	const char *fmt_hdr, *fmt_entry;
 	uma_keg_t kz;
 	uma_zone_t z;
 	uint64_t allocs, used, sleeps, xdomain;
 	long cachefree;
 	/* variables for sorting */
 	uma_keg_t cur_keg;
 	uma_zone_t cur_zone, last_zone;
 	int64_t cur_size, last_size, size;
 	int ties;
 
 	/* /i option produces machine-parseable CSV output */
 	if (modif[0] == 'i') {
 		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
 		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
 	} else {
 		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
 		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
 	}
 
 	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
 	    "Sleeps", "Bucket", "Total Mem", "XFree");
 
 	/* Sort the zones with largest size first. */
 	last_zone = NULL;
 	last_size = INT64_MAX;
 	for (;;) {
 		cur_zone = NULL;
 		/* Always paired with cur_zone; initialized for clarity. */
 		cur_keg = NULL;
 		cur_size = -1;
 		ties = 0;
 		LIST_FOREACH(kz, &uma_kegs, uk_link) {
 			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 				/*
 				 * In the case of size ties, print out zones
 				 * in the order they are encountered.  That is,
 				 * when we encounter the most recently output
 				 * zone, we have already printed all preceding
 				 * ties, and we must print all following ties.
 				 */
 				if (z == last_zone) {
 					ties = 1;
 					continue;
 				}
 				size = get_uma_stats(kz, z, &allocs, &used,
 				    &sleeps, &cachefree, &xdomain);
 				/*
 				 * "last_size + ties" admits zones equal in
 				 * size to the last one only once that zone
 				 * has been passed in list order.
 				 */
 				if (size > cur_size && size < last_size + ties)
 				{
 					cur_size = size;
 					cur_zone = z;
 					cur_keg = kz;
 				}
 			}
 		}
 		if (cur_zone == NULL)
 			break;
 
 		/* Re-fetch the winner's statistics for printing. */
 		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
 		    &sleeps, &cachefree, &xdomain);
 		/* Every %ju/%jd argument is cast to the matching max type. */
 		db_printf(fmt_entry, cur_zone->uz_name,
 		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
 		    (uintmax_t)allocs, (uintmax_t)sleeps,
 		    (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
 		    (uintmax_t)xdomain);
 
 		if (db_pager_quit)
 			return;
 		last_zone = cur_zone;
 		last_size = cur_size;
 	}
 }
 
 /*
  * DDB "show umacache": print one line per cache zone on uma_cachezones,
  * in list order.  "Free" combines the items in per-CPU buckets with those
  * in the per-domain zone caches; "Used" is allocations minus frees.
  * Output stops early when the pager is quit.
  */
 DB_SHOW_COMMAND(umacache, db_show_umacache)
 {
 	uma_zone_t zone;
 	uint64_t nallocs, nfrees;
 	long nfree;
 	int dom;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Bucket");
 	LIST_FOREACH(zone, &uma_cachezones, uz_link) {
 		uma_zone_sumstat(zone, &nfree, &nallocs, &nfrees, NULL, NULL);
 		dom = 0;
 		while (dom < vm_ndomains) {
 			nfree += ZDOM_GET(zone, dom)->uzd_nitems;
 			dom++;
 		}
 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
 		    zone->uz_name, (uintmax_t)zone->uz_size,
 		    (intmax_t)(nallocs - nfrees), nfree,
 		    (uintmax_t)nallocs, zone->uz_bucket_size);
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif	/* DDB */
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 6927d7af4409..0644c1167984 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1,5618 +1,5618 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Resident memory management module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/domainset.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sleepqueue.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_dumpset.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 
 struct vm_domain vm_dom[MAXMEMDOM];
 
 DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
 
 struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
 
 struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
 /* The following fields are protected by the domainset lock. */
 domainset_t __exclusive_cache_line vm_min_domains;
 domainset_t __exclusive_cache_line vm_severe_domains;
 static int vm_min_waiters;
 static int vm_severe_waiters;
 static int vm_pageproc_waiters;
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM page statistics");
 
 static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
     CTLFLAG_RD, &pqstate_commit_retries,
     "Number of failed per-page atomic queue state updates");
 
 static COUNTER_U64_DEFINE_EARLY(queue_ops);
 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
     CTLFLAG_RD, &queue_ops,
     "Number of batched queue operations");
 
 static COUNTER_U64_DEFINE_EARLY(queue_nops);
 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
     CTLFLAG_RD, &queue_nops,
     "Number of batched queue operations with no effects");
 
 /*
  * bogus page -- for I/O to/from partially complete buffers,
  * or for paging into sparsely invalid regions.
  */
 vm_page_t bogus_page;
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
 long first_page;
 
 struct bitset *vm_page_dump;
 long vm_page_dump_pages;
 
 static TAILQ_HEAD(, vm_page) blacklist_head;
 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
 
 static uma_zone_t fakepg_zone;
 
 static void vm_page_alloc_check(vm_page_t m);
 static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
     vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(vm_page_t m, uint8_t queue);
 static bool vm_page_free_prep(vm_page_t m);
 static void vm_page_free_toq(vm_page_t m);
 static void vm_page_init(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
 static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
     const uint16_t nflag);
 static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
     vm_page_t m_run, vm_paddr_t high);
 static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
 static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
     int req);
 static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
     int flags);
 static void vm_page_zone_release(void *arg, void **store, int cnt);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
 
 /*
  * SYSINIT hook (SI_SUB_VM, SI_ORDER_SECOND): create the "fakepg" UMA zone
  * of struct vm_page objects (UMA_ZONE_NOFREE) and allocate the wired,
  * object-less page stored in bogus_page.  'dummy' is unused.
  */
 static void
 vm_page_init(void *dummy)
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 }
 
 /*
  * The cache page zone is initialized later since we need to be able to allocate
  * pages before UMA is fully initialized.
  *
  * Creates one "vm pgcache" cache zone per (domain, free pool) pair, backed
  * by vm_page_zone_import()/vm_page_zone_release(), and caps how many pages
  * each zone may cache.  The cap comes from the vm.pgcache_zone_max_pcpu
  * tunable multiplied by mp_ncpus when the tunable is set to a non-zero
  * value, and otherwise defaults to 1/1000 of the domain's page count.
  */
 static void
 vm_page_init_cache_zones(void *dummy __unused)
 {
 	struct vm_domain *vmd;
 	struct vm_pgcache *pgcache;
 	int cache, domain, maxcache, pool;
 
 	/* 0 means "no tunable override"; see the fallback below. */
 	maxcache = 0;
 	TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &maxcache);
 	maxcache *= mp_ncpus;
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		vmd = VM_DOMAIN(domain);
 		for (pool = 0; pool < VM_NFREEPOOL; pool++) {
 			pgcache = &vmd->vmd_pgcache[pool];
 			pgcache->domain = domain;
 			pgcache->pool = pool;
 			/* pgcache is the argument given to import/release. */
 			pgcache->zone = uma_zcache_create("vm pgcache",
 			    PAGE_SIZE, NULL, NULL, NULL, NULL,
 			    vm_page_zone_import, vm_page_zone_release, pgcache,
 			    UMA_ZONE_VM);
 
 			/*
 			 * Limit each pool's zone to 0.1% of the pages in the
 			 * domain.
 			 */
 			cache = maxcache != 0 ? maxcache :
 			    vmd->vmd_page_count / 1000;
 			uma_zone_set_maxcache(pgcache->zone, cache);
 		}
 	}
 }
 SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
 
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
 #if PAGE_SIZE == 32768
 #ifdef CTASSERT
 CTASSERT(sizeof(u_long) >= 8);
 #endif
 #endif
 
 /*
  *	vm_set_page_size:
  *
  *	Establish vm_cnt.v_page_size.  A value that is still zero is
  *	replaced with PAGE_SIZE; the resulting size must be a power of
  *	two or the kernel panics.  Must be called before any use of
  *	page-size dependent functions.
  */
 void
 vm_set_page_size(void)
 {
 	if (vm_cnt.v_page_size == 0)
 		vm_cnt.v_page_size = PAGE_SIZE;
 
 	/* A power of two shares no bits with its predecessor. */
 	if ((vm_cnt.v_page_size & (vm_cnt.v_page_size - 1)) != 0)
 		panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
  *	vm_page_blacklist_next:
  *
  *	Find the next entry in the provided string of blacklist
  *	addresses.  Entries are separated by space, comma, or newline.
  *	If an invalid integer is encountered then the rest of the
  *	string is skipped.  Updates the list pointer to the next
  *	character, or NULL if the string is exhausted or invalid.
  *
  *	Returns the page-truncated address of the entry found, or 0 when
  *	there is nothing (more) to return.  'end' may be NULL for a
  *	NUL-terminated string; otherwise the byte at 'end' is read and,
  *	if it is a separator, overwritten with NUL.
  */
 static vm_paddr_t
 vm_page_blacklist_next(char **list, char *end)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
 	if (list == NULL || *list == NULL)
 		return (0);
 	/* An empty remainder means the list is exhausted. */
 	if (**list =='\0') {
 		*list = NULL;
 		return (0);
 	}
 
 	/*
 	 * If there's no end pointer then the buffer is coming from
 	 * the kenv and we know it's null-terminated.
 	 */
 	if (end == NULL)
 		end = *list + strlen(*list);
 
 	/* Ensure that strtoq() won't walk off the end */
 	if (*end != '\0') {
 		if (*end == '\n' || *end == ' ' || *end  == ',')
 			*end = '\0';
 		else {
 			printf("Blacklist not terminated, skipping\n");
 			*list = NULL;
 			return (0);
 		}
 	}
 
 	for (pos = *list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
 		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
 			/*
 			 * A parsed value of 0 is never returned: step over
 			 * the separator and keep scanning, or give up if
 			 * that reaches the end of the buffer.
 			 */
 			if (bad == 0) {
 				if (++cp < end)
 					continue;
 				else
 					break;
 			}
 		} else
 			/* Number followed by an unexpected character. */
 			break;
 		/* Leave *list just past the separator, or NULL at the end. */
 		if (*cp == '\0' || ++cp >= end)
 			*list = NULL;
 		else
 			*list = cp;
 		return (trunc_page(bad));
 	}
 	printf("Garbage in RAM blacklist, skipping\n");
 	*list = NULL;
 	return (0);
 }
 
 /*
  * Try to blacklist the page at physical address 'pa': remove it from the
  * physical allocator's free lists and, if that succeeds, account for it
  * and append it to blacklist_head.  When 'verbose' is set, a message is
  * printed for each page blacklisted.
  *
  * Returns true if no vm_page exists for 'pa', otherwise the result of
  * vm_phys_unfree_page() converted to bool.
  */
 bool
 vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
 {
 	struct vm_domain *vmd;
 	vm_page_t m;
 	int ret;
 
 	m = vm_phys_paddr_to_vm_page(pa);
 	if (m == NULL)
 		return (true); /* page does not exist, no failure */
 
 	vmd = vm_pagequeue_domain(m);
 	/* The free lists are only touched under the domain's free lock. */
 	vm_domain_free_lock(vmd);
 	ret = vm_phys_unfree_page(m);
 	vm_domain_free_unlock(vmd);
 	if (ret != 0) {
 		/* One fewer free page in this domain. */
 		vm_domain_freecnt_inc(vmd, -1);
 		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
 		if (verbose)
 			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
 	}
 	return (ret);
 }
 
 /*
  *	vm_page_blacklist_check:
  *
  *	Walk the provided string of blacklist addresses.  Every non-zero
  *	address produced by vm_page_blacklist_next() is handed to
  *	vm_page_blacklist_add(), which pulls the page out of the physical
  *	allocator free list and records it for reporting via the
  *	vm.page_blacklist sysctl.  The walk ends once the parser sets the
  *	cursor to NULL.
  */
 static void
 vm_page_blacklist_check(char *list, char *end)
 {
 	vm_paddr_t pa;
 	char *cursor;
 
 	for (cursor = list; cursor != NULL; ) {
 		pa = vm_page_blacklist_next(&cursor, end);
 		if (pa != 0)
 			vm_page_blacklist_add(pa, bootverbose);
 	}
 }
 
 /*
  *	vm_page_blacklist_load:
  *
  *	Search for a special module named "ram_blacklist".  It'll be a
  *	plain text file provided by the user via the loader directive
  *	of the same name.
  *
  *	On return *list is the start of the module's data and *end is the
  *	address just past it; both are NULL when the module is missing
  *	or has no data address.
  */
 static void
 vm_page_blacklist_load(char **list, char **end)
 {
 	void *mod;
 	u_char *base;
 	u_int size;
 
 	*list = NULL;
 	*end = NULL;
 
 	mod = preload_search_by_type("ram_blacklist");
 	if (mod == NULL)
 		return;
 
 	base = preload_fetch_addr(mod);
 	size = preload_fetch_size(mod);
 	*list = base;
 	if (base != NULL)
 		*end = base + size;
 }
 
 static int
 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
 {
 	vm_page_t m;
 	struct sbuf sbuf;
 	int error, first;
 
 	first = 1;
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	TAILQ_FOREACH(m, &blacklist_head, listq) {
 		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
 		    (uintmax_t)m->phys_addr);
 		first = 0;
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * Prepare a dummy page to serve as a marker in scans of the given paging
  * queue.  The page is zeroed and then tagged with PG_MARKER, the queue
  * index and the caller's atomic flags.  Setting PG_MARKER is all that is
  * needed in principle; the page is additionally write busied as a safety
  * precaution.
  */
 void
 vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
 {
 
 	bzero(marker, sizeof(*marker));
 	marker->a.queue = queue;
 	marker->a.flags = aflags;
 	marker->flags = PG_MARKER;
 	marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
 }
 
 /*
  * Initialize the per-domain VM state for "domain": zero the structure,
  * name and initialize each page queue together with its lock and scan
  * marker, initialize the domain locks, and seed the inactive and active
  * queues with their permanent marker pages.
  */
 static void
 vm_page_domain_init(int domain)
 {
 	struct vm_domain *dom;
 	struct vm_pagequeue *queues, *q;
 	int qidx;
 
 	dom = VM_DOMAIN(domain);
 	bzero(dom, sizeof(*dom));
 	queues = dom->vmd_pagequeues;
 
 	/*
 	 * The queue names are written through __DECONST; presumably pq_name
 	 * is declared const and is only ever set here.
 	 */
 	*__DECONST(const char **, &queues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(const char **, &queues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(const char **, &queues[PQ_LAUNDRY].pq_name) =
 	    "vm laundry pagequeue";
 	*__DECONST(const char **, &queues[PQ_UNSWAPPABLE].pq_name) =
 	    "vm unswappable pagequeue";
 
 	dom->vmd_domain = domain;
 	dom->vmd_page_count = 0;
 	dom->vmd_free_count = 0;
 	dom->vmd_segs = 0;
 	dom->vmd_oom = FALSE;
 
 	/* Per-queue list head, lock, scan counter and scan marker. */
 	for (qidx = 0; qidx < PQ_COUNT; qidx++) {
 		q = &queues[qidx];
 		TAILQ_INIT(&q->pq_pl);
 		mtx_init(&q->pq_mutex, q->pq_name, "vm pagequeue",
 		    MTX_DEF | MTX_DUPOK);
 		q->pq_pdpages = 0;
 		vm_page_init_marker(&dom->vmd_markers[qidx], qidx, 0);
 	}
 
 	mtx_init(&dom->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	mtx_init(&dom->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
 	snprintf(dom->vmd_name, sizeof(dom->vmd_name), "%d", domain);
 
 	/*
 	 * The inacthead marker provides FIFO ordering for insertions that
 	 * bypass LRU; it starts out at the head of the inactive queue.
 	 */
 	vm_page_init_marker(&dom->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
 	TAILQ_INSERT_HEAD(&queues[PQ_INACTIVE].pq_pl, &dom->vmd_inacthead,
 	    plinks.q);
 
 	/*
 	 * Two clock-hand markers let the active queue be scanned without
 	 * requeueing pages.  A scan begins at clock[0], which is advanced
 	 * once the scan finishes.  When the hands meet they are reset and
 	 * scanning restarts from the head of the queue.
 	 */
 	vm_page_init_marker(&dom->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
 	vm_page_init_marker(&dom->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
 	TAILQ_INSERT_HEAD(&queues[PQ_ACTIVE].pq_pl, &dom->vmd_clock[0],
 	    plinks.q);
 	TAILQ_INSERT_TAIL(&queues[PQ_ACTIVE].pq_pl, &dom->vmd_clock[1],
 	    plinks.q);
 }
 
 /*
  * Reset the page structure "m" so that it describes the physical page at
  * "pa" in segment "segind", ready to be handed to the free lists.
  */
 void
 vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
 {
 
 	/* Physical identity. */
 	m->phys_addr = pa;
 	m->segind = segind;
 
 	/* Not owned by any object, unreferenced, and marked as freed. */
 	m->object = NULL;
 	m->ref_count = 0;
 	m->busy_lock = VPB_FREED;
 
 	/* No flags, not on any paging queue. */
 	m->a.flags = 0;
 	m->flags = 0;
 	m->a.queue = PQ_NONE;
 
 	/* Allocator bookkeeping. */
 	m->psind = 0;
 	m->order = VM_NFREEORDER;
 	m->pool = VM_FREEPOOL_DEFAULT;
 
 	/* No valid or dirty contents. */
 	m->dirty = 0;
 	m->valid = 0;
 
 	pmap_page_init(m);
 }
 
 #ifndef PMAP_HAS_PAGE_ARRAY
 /*
  * Take the physical memory for "page_range" page structures from the top
  * of the range ending at "end", map it at "*vaddr" and publish it as
  * vm_page_array / vm_page_array_size.  Returns the new, lower end of the
  * physical range.
  */
 static vm_paddr_t
 vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
 {
 	vm_paddr_t array_start;
 
 	/*
 	 * Skip one page of KVA so that an unmapped guard page precedes the
 	 * array and traps accesses to vm_page_array[-1].  The guard lives
 	 * in KVM only, so out-of-bounds accesses made through the direct
 	 * map are not caught.
 	 */
 	*vaddr += PAGE_SIZE;
 
 	/* Reserve the backing physical memory and map it read/write. */
 	array_start = trunc_page(end - page_range * sizeof(struct vm_page));
 	vm_page_array = (vm_page_t)pmap_map(vaddr, array_start, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array_size = page_range;
 
 	return (array_start);
 }
 #endif
 
 /*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.  Allocates physical memory for
  *	bootstrapping UMA and some data structures that are used to manage
  *	physical pages.  Initializes these structures, and populates the free
  *	page queues.
  *
  *	"vaddr" is the first kernel virtual address available for the
  *	mappings created here; the (page-rounded, possibly advanced) address
  *	is returned to the caller.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	struct vm_phys_seg *seg;
 	struct vm_domain *vmd;
 	vm_page_t m;
 	char *list, *listend;
 	vm_paddr_t end, high_avail, low_avail, new_end, size;
 	vm_paddr_t page_range __unused;
 	vm_paddr_t last_pa, pa, startp, endp;
 	u_long pagecount;
 #if MINIDUMP_PAGE_TRACKING
 	u_long vm_page_dump_size;
 #endif
 	int biggestone, i, segind;
 #ifdef WITNESS
 	vm_offset_t mapped;
 	int witness_size;
 #endif
 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
 	long ii;
 #endif
 
 	vaddr = round_page(vaddr);
 
 	/*
 	 * Boot-time allocations below are carved from the top of the
 	 * phys_avail[] range selected by vm_phys_avail_largest(); "end" is
 	 * the current top of that range and "new_end" tracks it as memory
 	 * is taken.
 	 */
 	vm_phys_early_startup();
 	biggestone = vm_phys_avail_largest();
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the page and queue locks.
 	 */
 	mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(i);
 
 	new_end = end;
 #ifdef WITNESS
 	/*
 	 * Take witness's startup storage from the top of the range, map it,
 	 * zero it and hand it to witness_startup().
 	 */
 	witness_size = round_page(witness_startup_count());
 	new_end -= witness_size;
 	mapped = pmap_map(&vaddr, new_end, new_end + witness_size,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)mapped, witness_size);
 	witness_startup((void *)mapped);
 #endif
 
 #if MINIDUMP_PAGE_TRACKING
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
 	 *
 	 * The amd64 port needs this to indicate which direct map pages
 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 	 *
 	 * However, i386 still needs this workspace internally within the
 	 * minidump code.  In theory, they are not needed on i386, but are
 	 * included should the sf_buf code decide to use them.
 	 */
 	last_pa = 0;
 	vm_page_dump_pages = 0;
 	/* Count the pages spanned by each dump_avail[] range. */
 	for (i = 0; dump_avail[i + 1] != 0; i += 2) {
 		vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) -
 		    dump_avail[i] / PAGE_SIZE;
 		if (dump_avail[i + 1] > last_pa)
 			last_pa = dump_avail[i + 1];
 	}
 	vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages));
 	new_end -= vm_page_dump_size;
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #else
 	/* Reference last_pa so that it is not flagged as unused. */
 	(void)last_pa;
 #endif
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
     defined(__riscv) || defined(__powerpc64__)
 	/*
 	 * Include the UMA bootstrap pages, witness pages and vm_page_dump
 	 * in a crash dump.  When pmap_map() uses the direct map, they are
 	 * not automatically included.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif
 	/* Shrink the chosen range by the memory taken above. */
 	phys_avail[biggestone + 1] = new_end;
 #ifdef __amd64__
 	/*
 	 * Request that the physical pages underlying the message buffer be
 	 * included in a crash dump.  Since the message buffer is accessed
 	 * through the direct map, they are not automatically included.
 	 */
 	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
 	last_pa = pa + round_page(msgbufsize);
 	while (pa < last_pa) {
 		dump_add_page(pa);
 		pa += PAGE_SIZE;
 	}
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use, taking into account the overhead of a page structure per page.
 	 * In other words, solve
 	 *	"available physical memory" - round_page(page_range *
 	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE
 	 * for page_range.
 	 */
 	low_avail = phys_avail[0];
 	high_avail = phys_avail[1];
 	/* Widen [low_avail, high_avail) to cover every known segment. */
 	for (i = 0; i < vm_phys_nsegs; i++) {
 		if (vm_phys_segs[i].start < low_avail)
 			low_avail = vm_phys_segs[i].start;
 		if (vm_phys_segs[i].end > high_avail)
 			high_avail = vm_phys_segs[i].end;
 	}
 	/* Skip the first chunk.  It is already accounted for. */
 	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
 		if (phys_avail[i] < low_avail)
 			low_avail = phys_avail[i];
 		if (phys_avail[i + 1] > high_avail)
 			high_avail = phys_avail[i + 1];
 	}
 	first_page = low_avail / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
 	/* Sparse: only memory that actually exists is counted. */
 	size = 0;
 	for (i = 0; i < vm_phys_nsegs; i++)
 		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		size += phys_avail[i + 1] - phys_avail[i];
 #elif defined(VM_PHYSSEG_DENSE)
 	/* Dense: the whole span is counted, holes included. */
 	size = high_avail - low_avail;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 
 #ifdef PMAP_HAS_PAGE_ARRAY
 	/*
 	 * The pmap layer sets up the page array itself; re-read the range
 	 * afterwards, since phys_avail[] is consulted again here.
 	 */
 	pmap_page_array_startup(size / PAGE_SIZE);
 	biggestone = vm_phys_avail_largest();
 	end = new_end = phys_avail[biggestone + 1];
 #else
 #ifdef VM_PHYSSEG_DENSE
 	/*
 	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
 	 * the overhead of a page structure per page only if vm_page_array is
 	 * allocated from the last physical memory chunk.  Otherwise, we must
 	 * allocate page structures representing the physical memory
 	 * underlying vm_page_array, even though they will not be used.
 	 */
 	if (new_end != high_avail)
 		page_range = size / PAGE_SIZE;
 	else
 #endif
 	{
 		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
 
 		/*
 		 * If the partial bytes remaining are large enough for
 		 * a page (PAGE_SIZE) without a corresponding
 		 * 'struct vm_page', then new_end will contain an
 		 * extra page after subtracting the length of the VM
 		 * page array.  Compensate by subtracting an extra
 		 * page from new_end.
 		 */
 		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
 			if (new_end == high_avail)
 				high_avail -= PAGE_SIZE;
 			new_end -= PAGE_SIZE;
 		}
 	}
 	end = new_end;
 	new_end = vm_page_array_alloc(&vaddr, end, page_range);
 #endif
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate physical memory for the reservation management system's
 	 * data structures, and map it.
 	 */
 	new_end = vm_reserv_startup(&vaddr, new_end);
 #endif
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
     defined(__riscv) || defined(__powerpc64__)
 	/*
 	 * Include vm_page_array and vm_reserv_array in a crash dump.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif
 	/* Shrink the chosen range again by the arrays allocated above. */
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Add physical memory segments corresponding to the available
 	 * physical pages.
 	 */
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		if (vm_phys_avail_size(i) != 0)
 			vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
 
 	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
 	 * Initialize the page structures and add every available page to the
 	 * physical memory allocator's free lists.
 	 */
 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
 	/*
 	 * First mark every vm_page_array entry fictitious.  Entries that
 	 * belong to a segment are presumably re-initialized (clearing the
 	 * flag) by the segment loop below.
 	 */
 	for (ii = 0; ii < vm_page_array_size; ii++) {
 		m = &vm_page_array[ii];
 		vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0);
 		m->flags = PG_FICTITIOUS;
 	}
 #endif
 	vm_cnt.v_page_count = 0;
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		/* Initialize the page structure of every page in the segment. */
 		for (m = seg->first_page, pa = seg->start; pa < seg->end;
 		    m++, pa += PAGE_SIZE)
 			vm_page_init_page(m, pa, segind);
 
 		/*
 		 * Add the segment's pages that are covered by one of
 		 * phys_avail's ranges to the free lists.
 		 */
 		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 			/* Skip ranges that do not overlap the segment. */
 			if (seg->end <= phys_avail[i] ||
 			    seg->start >= phys_avail[i + 1])
 				continue;
 
 			/* Clip the range to the segment. */
 			startp = MAX(seg->start, phys_avail[i]);
 			endp = MIN(seg->end, phys_avail[i + 1]);
 			pagecount = (u_long)atop(endp - startp);
 			if (pagecount == 0)
 				continue;
 
 			/*
 			 * Free the run under the domain's free lock, then
 			 * update the global and per-domain counters.
 			 */
 			m = seg->first_page + atop(startp - seg->start);
 			vmd = VM_DOMAIN(seg->domain);
 			vm_domain_free_lock(vmd);
 			vm_phys_enqueue_contig(m, pagecount);
 			vm_domain_free_unlock(vmd);
 			vm_domain_freecnt_inc(vmd, pagecount);
 			vm_cnt.v_page_count += (u_int)pagecount;
 			vmd->vmd_page_count += (u_int)pagecount;
 			vmd->vmd_segs |= 1UL << segind;
 		}
 	}
 
 	/*
 	 * Remove blacklisted pages from the physical memory allocator.
 	 * Two sources are consulted: the preloaded "ram_blacklist" module
 	 * and the "vm.blacklist" kernel environment variable.
 	 */
 	TAILQ_INIT(&blacklist_head);
 	vm_page_blacklist_load(&list, &listend);
 	vm_page_blacklist_check(list, listend);
 
 	list = kern_getenv("vm.blacklist");
 	vm_page_blacklist_check(list, NULL);
 
 	/* Release the string obtained from kern_getenv(). */
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Initialize the reservation management system.
 	 */
 	vm_reserv_init();
 #endif
 
 	return (vaddr);
 }
 
 /*
  * Mark the page as referenced by setting its PGA_REFERENCED aflag.
  */
 void
 vm_page_reference(vm_page_t m)
 {
 	vm_page_aflag_set(m, PGA_REFERENCED);
 }
 
 /*
  *	vm_page_trybusy
  *
  *	Helper for the grab functions: make one non-sleeping attempt to
  *	busy the page.  A shared busy is attempted when VM_ALLOC_SBUSY or
  *	VM_ALLOC_IGN_SBUSY is requested, an exclusive busy otherwise.
  *
  *	Returns true on success and false on failure.
  */
 static bool
 vm_page_trybusy(vm_page_t m, int allocflags)
 {
 	const int shared_mask = VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY;
 
 	if ((allocflags & shared_mask) == 0)
 		return (vm_page_tryxbusy(m));
 	return (vm_page_trysbusy(m));
 }
 
 /*
  *	vm_page_tryacquire
  *
  *	Helper for the grab functions: try to busy the page and, when
  *	VM_ALLOC_WIRED is requested, wire it once the busy lock is held.
  *
  *	Returns true on success and false on failure.
  */
 static inline bool
 vm_page_tryacquire(vm_page_t m, int allocflags)
 {
 
 	if (!vm_page_trybusy(m, allocflags))
 		return (false);
 	if ((allocflags & VM_ALLOC_WIRED) != 0)
 		vm_page_wire(m);
 	return (true);
 }
 
 /*
  *	vm_page_busy_acquire:
  *
  *	Acquire the busy lock as described by the VM_ALLOC_* flags, looping
  *	and dropping the object lock around sleeps when necessary.
  *
  *	Returns true once the page is busied.  Returns false if the first
  *	attempt fails with VM_ALLOC_NOWAIT, or after one wait with
  *	VM_ALLOC_WAITFAIL.
  */
 bool
 vm_page_busy_acquire(vm_page_t m, int allocflags)
 {
 	vm_object_t obj;
 	bool locked, slept;
 
 	/*
 	 * Cache the page's object up front: the page may change identity
 	 * while we sleep, and re-locking must target the object that was
 	 * locked on entry.  Callers are assumed to already hold a
 	 * reference to the object.
 	 */
 	obj = atomic_load_ptr(&m->object);
 	while (!vm_page_tryacquire(m, allocflags)) {
 		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 			return (false);
 		locked = obj != NULL && VM_OBJECT_WOWNED(obj);
 		MPASS(locked || vm_page_wired(m));
 
 		/* Sleeping drops the object lock; take it back afterwards. */
 		slept = _vm_page_busy_sleep(obj, m, m->pindex, "vmpba",
 		    allocflags, locked);
 		if (slept && locked)
 			VM_OBJECT_WLOCK(obj);
 		if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
 			return (false);
 		KASSERT(m->object == obj || m->object == NULL,
 		    ("vm_page_busy_acquire: page %p does not belong to %p",
 		    m, obj));
 	}
 	return (true);
 }
 
 /*
  *	vm_page_busy_downgrade:
  *
  *	Convert an exclusive busy into a single shared busy, waking any
  *	threads recorded as waiting on the page.
  */
 void
 vm_page_busy_downgrade(vm_page_t m)
 {
 	u_int old;
 
 	vm_page_assert_xbusied(m);
 
 	/* Retry until the busy word is swapped for "one sharer". */
 	old = vm_page_busy_fetch(m);
 	while (!atomic_fcmpset_rel_int(&m->busy_lock, &old,
 	    VPB_SHARERS_WORD(1)))
 		continue;
 
 	/* The new word carries no waiters bit, so wake them now. */
 	if ((old & VPB_BIT_WAITERS) != 0)
 		wakeup(m);
 }
 
 /*
  *	vm_page_busy_tryupgrade:
  *
  *	Attempt to turn a single shared busy into an exclusive busy.  The
  *	waiters bit, if set, is carried over to the new busy word.
  *
  *	Returns 1 on success and 0 if there is more than one sharer.
  */
 int
 vm_page_busy_tryupgrade(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_assert_sbusied(m);
 
 	x = vm_page_busy_fetch(m);
 	do {
 		/* Other sharers exist; the upgrade cannot succeed. */
 		if (VPB_SHARERS(x) > 1)
 			return (0);
 		KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
 		    ("vm_page_busy_tryupgrade: invalid lock state"));
 	} while (!atomic_fcmpset_acq_int(&m->busy_lock, &x,
 	    VPB_CURTHREAD_EXCLUSIVE | (x & VPB_BIT_WAITERS)));
 	return (1);
 }
 
 /*
  *	vm_page_sbusied:
  *
  *	Return non-zero if the page is shared busied, 0 otherwise.  A busy
  *	word equal to VPB_UNBUSIED does not count as shared busied.
  */
 int
 vm_page_sbusied(vm_page_t m)
 {
 	u_int state;
 
 	state = vm_page_busy_fetch(m);
 	if ((state & VPB_BIT_SHARED) == 0)
 		return (0);
 	return (state != VPB_UNBUSIED);
 }
 
 /*
  *	vm_page_sunbusy:
  *
  *	Shared unbusy a page: drop one shared-busy reference.  If other
  *	sharers remain, only the sharer count is decremented.  If this is
  *	the last sharer, the page becomes VPB_UNBUSIED and any waiters
  *	recorded in the busy word are woken.
  */
 void
 vm_page_sunbusy(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_assert_sbusied(m);
 
 	x = vm_page_busy_fetch(m);
 	for (;;) {
 		KASSERT(x != VPB_FREED,
 		    ("vm_page_sunbusy: Unlocking freed page."));
 		/* Not the last sharer: just drop our count and leave. */
 		if (VPB_SHARERS(x) > 1) {
 			if (atomic_fcmpset_int(&m->busy_lock, &x,
 			    x - VPB_ONE_SHARER))
 				break;
 			/* Lost a race; retry with the value now in x. */
 			continue;
 		}
 		KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
 		    ("vm_page_sunbusy: invalid lock state"));
 		/* Last sharer: release the busy lock entirely. */
 		if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
 			continue;
 		/* x holds the pre-release word; wake waiters if flagged. */
 		if ((x & VPB_BIT_WAITERS) == 0)
 			break;
 		wakeup(m);
 		break;
 	}
 }
 
 /*
  *	vm_page_busy_sleep:
  *
  *	Sleep while the page is busy, using the page pointer as the wait
  *	channel.  This implements the hard path of the busying mechanism.
  *
  *	If nonshared is true, sleep only if the page is xbusy.
  *
  *	The object lock must be held on entry and is released on exit,
  *	whether or not the thread actually slept.
  */
 void
 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
 {
 	vm_object_t obj;
 	int allocflags;
 
 	obj = m->object;
 	VM_OBJECT_ASSERT_LOCKED(obj);
 	vm_page_lock_assert(m, MA_NOTOWNED);
 
 	allocflags = nonshared ? VM_ALLOC_SBUSY : 0;
 
 	/* When no sleep happened the lock is still held; drop it here. */
 	if (!_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags, true))
 		VM_OBJECT_DROP(obj);
 }
 
 /*
  *	vm_page_busy_sleep_unlocked:
  *
  *	Sleep while the page is busy, using the page pointer as the wait
  *	channel.  This implements the hard path of the busying mechanism.
  *
  *	If nonshared is true, sleep only if the page is xbusy.
  *
  *	The object lock must not be held on entry.  The operation returns
  *	without sleeping if the page has changed identity, i.e. no longer
  *	matches "obj" and "pindex".
  */
 void
 vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
     const char *wmesg, bool nonshared)
 {
 	int allocflags;
 
 	VM_OBJECT_ASSERT_UNLOCKED(obj);
 	vm_page_lock_assert(m, MA_NOTOWNED);
 
 	allocflags = nonshared ? VM_ALLOC_SBUSY : 0;
 	(void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false);
 }
 
 /*
  *	_vm_page_busy_sleep:
  *
  *	Internal busy sleep function.  Verifies the page identity and
  *	lockstate against parameters.  Returns true if it sleeps and
  *	false otherwise.
  *
  *	If locked is true the lock will be dropped for any true returns
  *	and held for any false returns.
  *
  *	When allocflags contains VM_ALLOC_SBUSY or VM_ALLOC_IGN_SBUSY, a
  *	page that is only shared busied does not cause a sleep.
  */
 static bool
 _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
     const char *wmesg, int allocflags, bool locked)
 {
 	bool xsleep;
 	u_int x;
 
 	/*
 	 * If the object is busy we must wait for that to drain to zero
 	 * before trying the page again.
 	 */
 	if (obj != NULL && vm_object_busied(obj)) {
 		if (locked)
 			VM_OBJECT_DROP(obj);
 		vm_object_busy_wait(obj, wmesg);
 		return (true);
 	}
 
 	/* Cheap unlocked check: nothing to wait for. */
 	if (!vm_page_busied(m))
 		return (false);
 
 	xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0;
 	/*
 	 * Take the sleepqueue lock for the page's wait channel before
 	 * re-examining the busy word and publishing the waiters bit.
 	 */
 	sleepq_lock(m);
 	x = vm_page_busy_fetch(m);
 	do {
 		/*
 		 * If the page changes objects or becomes unlocked we can
 		 * simply return.
 		 */
 		if (x == VPB_UNBUSIED ||
 		    (xsleep && (x & VPB_BIT_SHARED) != 0) ||
 		    m->object != obj || m->pindex != pindex) {
 			sleepq_release(m);
 			return (false);
 		}
 		/* Another waiter already set the bit; nothing to publish. */
 		if ((x & VPB_BIT_WAITERS) != 0)
 			break;
 	} while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS));
 	/* Committed to sleeping: release the object lock first. */
 	if (locked)
 		VM_OBJECT_DROP(obj);
 	DROP_GIANT();
 	sleepq_add(m, NULL, wmesg, 0, 0);
 	sleepq_wait(m, PVM);
 	PICKUP_GIANT();
 	return (true);
 }
 
 /*
  *	vm_page_trysbusy:
  *
  *	Try to shared busy a page.
  *	If the operation succeeds 1 is returned otherwise 0.
  *	The operation never sleeps.
  *
  *	The attempt fails if the page is not in a shareable state or if
  *	the page's object is busied, either before or after the busy lock
  *	is taken.
  */
 int
 vm_page_trysbusy(vm_page_t m)
 {
 	vm_object_t obj;
 	u_int x;
 
 	obj = m->object;
 	x = vm_page_busy_fetch(m);
 	for (;;) {
 		/* Shared bit clear: a shared busy cannot be added. */
 		if ((x & VPB_BIT_SHARED) == 0)
 			return (0);
 		/*
 		 * Reduce the window for transient busies that will trigger
 		 * false negatives in vm_page_ps_test().
 		 */
 		if (obj != NULL && vm_object_busied(obj))
 			return (0);
 		/* Add ourselves as a sharer; retry if the word changed. */
 		if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
 		    x + VPB_ONE_SHARER))
 			break;
 	}
 
 	/* Refetch the object now that we're guaranteed that it is stable. */
 	obj = m->object;
 	if (obj != NULL && vm_object_busied(obj)) {
 		/* The object became busy meanwhile; back out our share. */
 		vm_page_sunbusy(m);
 		return (0);
 	}
 	return (1);
 }
 
 /*
  *	vm_page_tryxbusy:
  *
  *	Try to exclusive busy a page.
  *	Returns 1 if the operation succeeds and 0 otherwise.
  *	The operation never sleeps.
  */
 int
 vm_page_tryxbusy(vm_page_t m)
 {
 	vm_object_t obj;
 
 	/* Only an unbusied page can be exclusively busied. */
 	if (!atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
 	    VPB_CURTHREAD_EXCLUSIVE))
 		return (0);
 
 	/* Back out if the page's object turns out to be busied. */
 	obj = m->object;
 	if (obj == NULL || !vm_object_busied(obj))
 		return (1);
 	vm_page_xunbusy(m);
 	return (0);
 }
 
 /*
  * Common tail of the hard exclusive-unbusy paths: release the busy lock
  * and wake any threads sleeping on the page.
  */
 static void
 vm_page_xunbusy_hard_tail(vm_page_t m)
 {
 
 	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 	/* Threads sleeping on the page use it as their wait channel. */
 	wakeup(m);
 }
 
 /*
  *	vm_page_xunbusy_hard:
  *
  *	Slow path of exclusive unbusy, called when the simple unbusy
  *	failed because a waiter is present.
  */
 void
 vm_page_xunbusy_hard(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 	vm_page_xunbusy_hard_tail(m);
 }
 
 /*
  * Variant of vm_page_xunbusy_hard() that uses the "unchecked" form of the
  * exclusive-busy assertion before releasing the lock.
  */
 void
 vm_page_xunbusy_hard_unchecked(vm_page_t m)
 {
 
 	vm_page_assert_xbusied_unchecked(m);
 	vm_page_xunbusy_hard_tail(m);
 }
 
 /*
  * Move the page's busy lock to the VPB_FREED state and wake any threads
  * that were recorded as waiting on the previous busy word.
  */
 static void
 vm_page_busy_free(vm_page_t m)
 {
 	u_int prev;
 
 	/* Release fence ahead of the swap that publishes VPB_FREED. */
 	atomic_thread_fence_rel();
 	prev = atomic_swap_int(&m->busy_lock, VPB_FREED);
 	if ((prev & VPB_BIT_WAITERS) == 0)
 		return;
 	wakeup(m);
 }
 
 /*
  *	vm_page_unhold_pages:
  *
  *	Unwire each of the "count" pages referenced by the array "ma",
  *	passing PQ_ACTIVE as the target queue.
  */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
 
 	while (count != 0) {
 		vm_page_unwire(*ma, PQ_ACTIVE);
 		ma++;
 		count--;
 	}
 }
 
 /*
  * Translate a physical address into its vm_page structure.  Addresses
  * without a regular page structure are looked up among the fictitious
  * ranges via vm_phys_fictitious_to_vm_page().
  */
 vm_page_t
 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 {
 #ifdef VM_PHYSSEG_SPARSE
 	vm_page_t pg;
 
 	pg = vm_phys_paddr_to_vm_page(pa);
 	if (pg != NULL)
 		return (pg);
 	return (vm_phys_fictitious_to_vm_page(pa));
 #elif defined(VM_PHYSSEG_DENSE)
 	long idx;
 
 	/* Outside the range covered by vm_page_array: try fictitious. */
 	idx = atop(pa);
 	if (idx < first_page || (idx - first_page) >= vm_page_array_size)
 		return (vm_phys_fictitious_to_vm_page(pa));
 	return (&vm_page_array[idx - first_page]);
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 }
 
 /*
  *	vm_page_getfake:
  *
  *	Allocate a zeroed fictitious page from fakepg_zone and initialize
  *	it with the specified physical address and memory attribute.  The
  *	memory attribute is the only machine-dependent aspect of a
  *	fictitious page that must be initialized.
  */
 vm_page_t
 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 {
 	vm_page_t fake;
 
 	fake = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 	vm_page_initfake(fake, paddr, memattr);
 	return (fake);
 }
 
 /*
  * Initialize "m" as a fictitious page for physical address "paddr" with
  * memory attribute "memattr".  If the page is already marked fictitious,
  * only its memory attribute is updated.
  */
 void
 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		/* First-time setup of the page structure. */
 		m->phys_addr = paddr;
 		m->a.queue = PQ_NONE;
 		/* Fictitious pages don't use "segind". */
 		m->flags = PG_FICTITIOUS;
 		/* Fictitious pages don't use "order" or "pool". */
 		m->oflags = VPO_UNMANAGED;
 		m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
 		/* Fictitious pages are unevictable. */
 		m->ref_count = 1;
 		pmap_page_init(m);
 	}
 
 	/*
 	 * For a page that was initialized earlier, the memattr might have
 	 * changed since then, so the pmap is always updated to the new one.
 	 */
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_putfake:
  *
  *	Release a fictitious page back to fakepg_zone.  The page must be
  *	unmanaged, fictitious and exclusively busied.
  */
 void
 vm_page_putfake(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_putfake: bad page %p", m));
 	vm_page_assert_xbusied(m);
 
 	/* Retire the busy lock before the memory goes back to the zone. */
 	vm_page_busy_free(m);
 	uma_zfree(fakepg_zone, m);
 }
 
 /*
  *	vm_page_updatefake:
  *
  *	Point an existing fictitious page at a new physical address and
  *	apply the given memory attribute to it.
  */
 void
 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_updatefake: bad page %p", m));
 
 	m->phys_addr = paddr;
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_free:
  *
  *	Free a page.  PG_ZERO is cleared first, so the page is not
  *	treated as pre-zeroed.
  */
 void
 vm_page_free(vm_page_t m)
 {
 	m->flags &= ~PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_free_zero:
  *
  *	Free a page that is known to be zeroed: PG_ZERO is set before the
  *	page is handed to vm_page_free_toq().
  */
 void
 vm_page_free_zero(vm_page_t m)
 {
 	m->flags |= PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  * Finish a page from a getpages request that was optionally read ahead or
  * behind: choose its paging queue and release the exclusive busy.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
 	/* Invalid pages must never be placed on the paging queues. */
 	KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));
 
 	/*
 	 * This is not the page that was actually requested, so it is not
 	 * obvious whether it should be activated or deactivated.
 	 * Empirically, deactivating is usually the better choice unless
 	 * another thread is already waiting for the page.
 	 */
 	if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) == 0)
 		vm_page_deactivate(m);
 	else
 		vm_page_activate(m);
 	vm_page_xunbusy_unchecked(m);
 }
 
 /*
  * Strip the identity from an invalid page and free it when possible.
  * Intended for the case where reading a page from backing store failed.
  *
  * The page must be invalid, unmapped and belong to an object that the
  * caller holds write-locked.
  */
 void
 vm_page_free_invalid(vm_page_t m)
 {
 
 	KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
 	KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
 	KASSERT(m->object != NULL, ("page %p has no object", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * When this runs as part of I/O error handling, the page may have
 	 * been xbusied by a different thread, so claim the busy lock.
 	 */
 	vm_page_xbusy_claim(m);
 
 	/*
 	 * If the page was wired while the object lock was not held, the
 	 * thread that unwires it is responsible for freeing it.  Otherwise
 	 * it is freed right here.  The wire count of this unmapped page
 	 * cannot change while the page is xbusy and its object is wlocked.
 	 */
 	if (!vm_page_remove(m))
 		return;
 	vm_page_free(m);
 }
 
 /*
  *	vm_page_sleep_if_busy:
  *
  *	If the page is busied, release the object lock, sleep, and take
  *	the object write lock again.  Returns TRUE if the thread slept and
  *	FALSE otherwise.
  *
  *	The given page must be unlocked and the object containing it must
  *	be write-locked.
  */
 int
 vm_page_sleep_if_busy(vm_page_t m, const char *wmesg)
 {
 	vm_object_t obj;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * Remember the page's object before sleeping: the page may change
 	 * identity during the sleep, and the lock that is re-taken must be
 	 * the one that was dropped.  Callers are assumed to hold a
 	 * reference to the object.
 	 */
 	obj = m->object;
 	if (!_vm_page_busy_sleep(obj, m, m->pindex, wmesg, 0, true))
 		return (FALSE);
 	VM_OBJECT_WLOCK(obj);
 	return (TRUE);
 }
 
 /*
  *	vm_page_sleep_if_xbusy:
  *
  *	If the page is exclusively busied, release the object lock, sleep,
  *	and take the object write lock again.  Returns TRUE if the thread
  *	slept and FALSE otherwise.
  *
  *	The given page must be unlocked and the object containing it must
  *	be write-locked.
  */
 int
 vm_page_sleep_if_xbusy(vm_page_t m, const char *wmesg)
 {
 	vm_object_t obj;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * Remember the page's object before sleeping: the page may change
 	 * identity during the sleep, and the lock that is re-taken must be
 	 * the one that was dropped.  Callers are assumed to hold a
 	 * reference to the object.
 	 */
 	obj = m->object;
 
 	/* VM_ALLOC_SBUSY makes the helper ignore shared busy holders. */
 	if (!_vm_page_busy_sleep(obj, m, m->pindex, wmesg, VM_ALLOC_SBUSY,
 	    true))
 		return (FALSE);
 	VM_OBJECT_WLOCK(obj);
 	return (TRUE);
 }
 
 /*
  *	vm_page_dirty_KBI:		[ internal use only ]
  *
  *	Mark the whole page dirty by setting every bit of its dirty field.
  *
  *	When called from the machine-independent layer, the object that
  *	contains the page must be locked.
  *
  *	See vm_page_clear_dirty_mask().
  *
  *	Only vm_page_dirty() should call this function.
  */
 void
 vm_page_dirty_KBI(vm_page_t m)
 {
 	/* The message deliberately names the public entry point. */
 	KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
 
 	m->dirty = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Insert the page into the object at "pindex", linking it into both
  *	the object's radix trie and its page list.  Returns the result of
  *	vm_page_insert_after(): 0 on success, 1 on failure.
  *
  *	The object must be write-locked.
  */
 int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/* The predecessor is the resident page at the largest index <= pindex. */
 	return (vm_page_insert_after(m, object, pindex,
 	    vm_radix_lookup_le(&object->rtree, pindex)));
 }
 
 /*
  *	vm_page_insert_after:
  *
  *	Insert the page "m" into the specified object at offset "pindex".
  *
  *	"mpred", when not NULL, must be the page that immediately precedes
  *	"pindex" within the object.
  *
  *	Returns 0 on success, or 1 if the radix trie insertion failed, in
  *	which case the page is left detached from the object.
  *
  *	The object must be write-locked.
  */
 static int
 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mpred)
 {
 	vm_page_t next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(m->object == NULL,
 	    ("vm_page_insert_after: page already inserted"));
 
 	/* Find the would-be successor to sanity check the ordering. */
 	if (mpred == NULL)
 		next = TAILQ_FIRST(&object->memq);
 	else {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_after: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
 		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 		next = TAILQ_NEXT(mpred, listq);
 	}
 	KASSERT(next == NULL || next->pindex > pindex,
 	    ("vm_page_insert_after: msucc doesn't succeed pindex"));
 
 	/*
 	 * Give the page its object/offset identity and the object's
 	 * reference before it becomes visible through the trie.
 	 */
 	m->object = object;
 	m->pindex = pindex;
 	m->ref_count |= VPRC_OBJREF;
 
 	/*
 	 * Hook the page into the radix trie; undo the identity on failure.
 	 */
 	if (vm_radix_insert(&object->rtree, m) != 0) {
 		m->object = NULL;
 		m->pindex = 0;
 		m->ref_count &= ~VPRC_OBJREF;
 		return (1);
 	}
 
 	/* Finish by linking into the object's ordered page list. */
 	vm_page_insert_radixdone(m, object, mpred);
 	return (0);
 }
 
 /*
  *	vm_page_insert_radixdone:
  *
  *	Finish inserting page "m" into the specified object once it has
  *	been hooked into the radix trie: link it into the object's page
  *	list and update the object's accounting.
  *
  *	"mpred", when not NULL, must precede the offset "m->pindex" within
  *	the object.
  *
  *	The object must be write-locked.
  */
 static void
 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object != NULL && m->object == object,
 	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
 	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
 	    ("vm_page_insert_radixdone: page %p is missing object ref", m));
 
 	/* Link after the predecessor, or at the head when there is none. */
 	if (mpred == NULL)
 		TAILQ_INSERT_HEAD(&object->memq, m, listq);
 	else {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_radixdone: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < m->pindex,
 		    ("vm_page_insert_radixdone: mpred doesn't precede pindex"));
 		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 	}
 
 	/*
 	 * Account for the new resident page.  The first resident page of a
 	 * vnode object takes a hold on the vnode, which lasts until the
 	 * last page is released.
 	 */
 	object->resident_page_count++;
 	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 		vhold(object->handle);
 
 	/*
 	 * The new page may already be dirty: if it is write mapped, mark
 	 * the object as writeable and dirty.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		vm_object_set_writeable_dirty(object);
 }
 
 /*
  * Perform the mechanics of detaching a page from its object: unhook it
  * from the radix trie and the page list and update the object's
  * accounting.  The caller remains responsible for updating the page's
  * own fields (such as its reference count) to reflect the removal.
  *
  * The page must be exclusively busied and its object write-locked.
  */
 static void
 vm_page_object_remove(vm_page_t m)
 {
 	vm_object_t obj;
 	vm_page_t removed;
 
 	vm_page_assert_xbusied(m);
 	obj = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(obj);
 	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
 	    ("page %p is missing its object ref", m));
 
 	/* Carry out a deferred free of swap space, if one is pending. */
 	if ((m->a.flags & PGA_SWAP_FREE) != 0)
 		vm_pager_page_unswapped(m);
 
 	/* Clear the object pointer, then unhook from the radix trie. */
 	m->object = NULL;
 	removed = vm_radix_remove(&obj->rtree, m->pindex);
 	KASSERT(removed == m, ("removed page %p, expected page %p", removed, m));
 
 	/* Unlink from the object's list of backed pages. */
 	TAILQ_REMOVE(&obj->memq, m, listq);
 
 	/*
 	 * One fewer resident page.  When the last page of a vnode object
 	 * goes away the hold on the vnode is dropped, so that the vnode
 	 * may be recycled.
 	 */
 	obj->resident_page_count--;
 	if (obj->resident_page_count == 0 && obj->type == OBJT_VNODE)
 		vdrop(obj->handle);
 }
 
 /*
  *	vm_page_remove:
  *
  *	Remove the page from its containing object without invalidating
  *	any backing storage.  Returns true if the object's reference was
  *	the last reference to the page, and false otherwise.
  *
  *	The object must be locked and the page must be exclusively busied.
  *	The exclusive busy is released on return.  If this was not the
  *	final reference and the caller holds no wire reference, the caller
  *	may not continue to access the page.
  */
 bool
 vm_page_remove(vm_page_t m)
 {
 	bool last_ref;
 
 	last_ref = vm_page_remove_xbusy(m);
 	vm_page_xunbusy(m);
 	return (last_ref);
 }
 
 /*
  *	vm_page_remove_xbusy
  *
  *	Same as removing the page from its object, except that the
  *	exclusive busy stays held.  The result is true if the object's
  *	reference was the final reference and false otherwise.
  */
 bool
 vm_page_remove_xbusy(vm_page_t m)
 {
 	bool last_ref;
 
 	vm_page_object_remove(m);
 	last_ref = (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
 	return (last_ref);
 }
 
 /*
  *	vm_page_lookup:
  *
  *	Find the page stored at the given object/offset pair.  The
  *	result is NULL when the object has no page at that offset.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	m = vm_radix_lookup(&object->rtree, pindex);
 	return (m);
 }
 
 /*
  *	vm_page_lookup_unlocked:
  *
  *	Find the page stored at the given object/offset pair; the result
  *	is NULL when none is found.  The page may no longer be present in
  *	the object by the time this function returns, so the result is
  *	only useful for opportunistic checks such as inmem().
  */
 vm_page_t
 vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 
 	m = vm_radix_lookup_unlocked(&object->rtree, pindex);
 	return (m);
 }
 
 /*
  *	vm_page_relookup:
  *
  *	Fetch a page that the caller must already have busied, using the
  *	unlocked radix lookup.  Used for bogus page replacement.  The
  *	lookup is asserted to find a busied or wired page that belongs
  *	to the given object at the given index.
  */
 vm_page_t
 vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t page;
 
 	page = vm_radix_lookup_unlocked(&object->rtree, pindex);
 	KASSERT(page != NULL &&
 	    (vm_page_busied(page) || vm_page_wired(page)) &&
 	    page->object == object && page->pindex == pindex,
 	    ("vm_page_relookup: Invalid page %p", page));
 	return (page);
 }
 
 /*
  * This should only be used by lockless functions for releasing transient
  * incorrect acquires.  The page may have been freed after we acquired a
  * busy lock.  In this case busy_lock == VPB_FREED and we have nothing
  * further to do.
  *
  * Otherwise the busy state is dropped with a compare-and-set loop: one
  * sharer is subtracted while other sharers remain, and the lock word is
  * reset to VPB_UNBUSIED when this is the last holder, waking any waiters.
  */
 static void
 vm_page_busy_release(vm_page_t m)
 {
 	u_int x;
 
 	x = vm_page_busy_fetch(m);
 	for (;;) {
 		/* The page was freed underneath us; nothing to release. */
 		if (x == VPB_FREED)
 			break;
 		/* Shared busy with other sharers left: just drop our share. */
 		if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
 			if (atomic_fcmpset_int(&m->busy_lock, &x,
 			    x - VPB_ONE_SHARER))
 				break;
 			/* Lost a race; re-evaluate with the value now in x. */
 			continue;
 		}
 		/*
 		 * Either we are the last sharer or the lock word must show
 		 * the current thread as the exclusive owner (ignoring the
 		 * waiters bit).
 		 */
 		KASSERT((x & VPB_BIT_SHARED) != 0 ||
 		    (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
 		    ("vm_page_busy_release: %p xbusy not owned.", m));
 		/* Fully unbusy the page, using the release variant. */
 		if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
 			continue;
 		/* x holds the pre-release word; wake sleepers if flagged. */
 		if ((x & VPB_BIT_WAITERS) != 0)
 			wakeup(m);
 		break;
 	}
 }
 
 /*
  *	vm_page_find_least:
  *
  *	Find the resident page of the object whose pindex is the smallest
  *	one that is greater than or equal to the parameter pindex.  The
  *	result is NULL if there is no such page.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t first;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * An empty object, or a first resident page that already
 	 * satisfies the bound, needs no radix tree lookup.
 	 */
 	first = TAILQ_FIRST(&object->memq);
 	if (first == NULL || first->pindex >= pindex)
 		return (first);
 	return (vm_radix_lookup_ge(&object->rtree, pindex));
 }
 
 /*
  * Find the page that directly follows the given page (by pindex) within
  * its object.  The result is NULL when the page at the next index is not
  * resident.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_next(vm_page_t m)
 {
 	vm_page_t succ;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	succ = TAILQ_NEXT(m, listq);
 	if (succ == NULL)
 		return (NULL);
 	MPASS(succ->object == m->object);
 	/* The list neighbour only counts if its index is adjacent. */
 	return (succ->pindex == m->pindex + 1 ? succ : NULL);
 }
 
 /*
  * Find the page that directly precedes the given page (by pindex) within
  * its object.  The result is NULL when the page at the previous index is
  * not resident.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_prev(vm_page_t m)
 {
 	vm_page_t pred;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	pred = TAILQ_PREV(m, pglist, listq);
 	if (pred == NULL)
 		return (NULL);
 	MPASS(pred->object == m->object);
 	/* The list neighbour only counts if its index is adjacent. */
 	return (pred->pindex == m->pindex - 1 ? pred : NULL);
 }
 
 /*
  * Uses the page mnew as a replacement for an existing page at index
  * pindex which must be already present in the object.
  *
  * Both pages must be exclusively busied on enter.  The old page is
  * unbusied on exit.  (Only mold's busy state is asserted here; the
  * wrapper vm_page_replace() asserts mnew's.)
  *
  * A return value of true means mold is now free.  If this is not the
  * final ref and the caller does not hold a wire reference it may not
  * continue to access the page.
  *
  * The object must be write-locked.
  */
 static bool
 vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mold)
 {
 	vm_page_t mret;
 	bool dropped;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_page_assert_xbusied(mold);
 	KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
 	    ("vm_page_replace: page %p already in object", mnew));
 
 	/*
 	 * This function mostly follows vm_page_insert() and
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
 
 	/* Give mnew its identity and object reference, then swap it in. */
 	mnew->object = object;
 	mnew->pindex = pindex;
 	atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
 	mret = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mret == mold,
 	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));
 	KASSERT((mold->oflags & VPO_UNMANAGED) ==
 	    (mnew->oflags & VPO_UNMANAGED),
 	    ("vm_page_replace: mismatched VPO_UNMANAGED"));
 
 	/* Keep the resident page list in sorted order. */
 	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 	TAILQ_REMOVE(&object->memq, mold, listq);
 	mold->object = NULL;
 
 	/*
 	 * The object's resident_page_count does not change because we have
 	 * swapped one page for another, but the generation count should
 	 * change if the page is dirty.
 	 */
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	/* Drop mold's object reference; note whether it was the last ref. */
 	dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
 	vm_page_xunbusy(mold);
 
 	return (dropped);
 }
 
 /*
  * Replace the page mold, resident at pindex in the object, with mnew, and
  * free mold if the replacement dropped its last reference.  mnew must be
  * exclusively busied.
  */
 void
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mold)
 {
 	bool last_ref;
 
 	vm_page_assert_xbusied(mnew);
 
 	last_ref = vm_page_replace_hold(mnew, object, pindex, mold);
 	if (last_ref)
 		vm_page_free(mold);
 }
 
 /*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
  *	current object to the specified target object/offset.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
  *	      page, (2) we are dirtying the page, (3) the VM system is probably
  *	      moving the page from object A to B, and will then later move
  *	      the backing store from A to B and we can't have a conflict.
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
  *	      swap.
  *
  *	Returns 0 on success.  Returns 1 if the page could not be inserted
  *	into the new object's radix tree; in that case the page keeps its
  *	original pindex and stays in its current object.
  *
  *	The objects must be locked.
  */
 int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
 	vm_page_t mpred;
 	vm_pindex_t opidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 
 	KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m));
 	/* Find the predecessor in the new object for the list insertion. */
 	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
 	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
 	    ("vm_page_rename: pindex already renamed"));
 
 	/*
 	 * Create a custom version of vm_page_insert() which does not depend
 	 * on m_prev and can cheat on the implementation aspects of the
 	 * function.
 	 */
 	opidx = m->pindex;
 	/* The radix insertion keys on m->pindex, so set it temporarily. */
 	m->pindex = new_pindex;
 	if (vm_radix_insert(&new_object->rtree, m)) {
 		m->pindex = opidx;
 		return (1);
 	}
 
 	/*
 	 * The operation cannot fail anymore.  The removal must happen before
 	 * the listq iterator is tainted.
 	 */
 	/* Restore the old index so removal from the old object finds m. */
 	m->pindex = opidx;
 	vm_page_object_remove(m);
 
 	/* Return back to the new pindex to complete vm_page_insert(). */
 	m->pindex = new_pindex;
 	m->object = new_object;
 
 	vm_page_insert_radixdone(m, new_object, mpred);
 	vm_page_dirty(m);
 	return (0);
 }
 
 /*
  *	vm_page_alloc:
  *
  *	Allocate a page for the given object and offset pair and return
  *	it.  Unless requested otherwise, the page comes back exclusive
  *	busied.
  *
  *	An allocation class must always be supplied by the caller.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
 	vm_page_t mpred;
 
 	/* Only an object-backed allocation has a predecessor to look up. */
 	mpred = NULL;
 	if (object != NULL)
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	return (vm_page_alloc_after(object, pindex, req, mpred));
 }
 
 /*
  * Variant of vm_page_alloc() that allocates from one specific domain.  The
  * predecessor page is looked up in the object's radix tree when an object
  * is given.
  */
 vm_page_t
 vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain,
     int req)
 {
 	vm_page_t mpred;
 
 	mpred = NULL;
 	if (object != NULL)
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	return (vm_page_alloc_domain_after(object, pindex, domain, req,
 	    mpred));
 }
 
 /*
  * Allocate a page in the specified object with the given page index.  To
  * optimize insertion of the page into the object, the caller must also
  * specify the resident page in the object with largest index smaller than
  * the given page index, or NULL if no such page exists.
  *
  * Domains are tried in the order produced by the domainset iterator until
  * one yields a page or the iterator is exhausted.
  */
 vm_page_t
 vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
     int req, vm_page_t mpred)
 {
 	struct vm_domainset_iter di;
 	vm_page_t m;
 	int domain;
 
 	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 	for (;;) {
 		m = vm_page_alloc_domain_after(object, pindex, domain, req,
 		    mpred);
 		if (m != NULL)
 			return (m);
 		if (vm_domainset_iter_page(&di, object, &domain) != 0)
 			return (NULL);
 	}
 }
 
 /*
  * Try to take "npages" pages out of the domain's free page count on behalf
  * of the given request class.  The result is 1 if the pages were reserved
  * and 0 if doing so would leave fewer free pages than the class's minimum.
  */
 static int
 _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
 {
 	u_int minfree, before, after;
 
 	/* Each allocation class may dig to a different depth. */
 	if (req_class == VM_ALLOC_INTERRUPT)
 		minfree = 0;
 	else if (req_class == VM_ALLOC_SYSTEM)
 		minfree = vmd->vmd_interrupt_free_min;
 	else
 		minfree = vmd->vmd_free_reserved;
 	minfree += npages;
 
 	/*
 	 * Reserve the pages with a compare-and-set loop, giving up as soon
 	 * as the free count is seen below the limit.
 	 */
 	before = vmd->vmd_free_count;
 	for (;;) {
 		if (before < minfree)
 			return (0);
 		after = before - npages;
 		if (atomic_fcmpset_int(&vmd->vmd_free_count, &before,
 		    after) != 0)
 			break;
 	}
 
 	/* Kick the page daemon when this allocation crossed its threshold. */
 	if (vm_paging_needed(vmd, after) && !vm_paging_needed(vmd, before))
 		pagedaemon_wakeup(vmd->vmd_domain);
 
 	/* The domain bitsets are only touched on a downward transition. */
 	if ((before >= vmd->vmd_free_min && after < vmd->vmd_free_min) ||
 	    (before >= vmd->vmd_free_severe &&
 	    after < vmd->vmd_free_severe))
 		vm_domain_set(vmd);
 
 	return (1);
 }
 
 /*
  * Reserve "npages" free pages in the domain for a request described by the
  * "req" allocation flags.  The result is that of _vm_domain_allocate() for
  * the effective allocation class.
  */
 int
 vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
 {
 	int cls;
 
 	cls = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon gets to dig deeper into the free page list, so
 	 * its non-interrupt requests are promoted to the system class.
 	 */
 	if (cls != VM_ALLOC_INTERRUPT && curproc == pageproc)
 		cls = VM_ALLOC_SYSTEM;
 	return (_vm_domain_allocate(vmd, cls, npages));
 }
 
 /*
  * Allocate one page from the given domain and, when "object" is non-NULL,
  * insert it into the object at "pindex" after the predecessor "mpred".
  * Sources are tried in order: the object's reservation (when reservations
  * are compiled in), the domain's page cache zone for the pool, and finally
  * the physical free page queues.
  *
  * Returns the initialized page, or NULL if no page could be allocated or
  * the insertion into the object failed.
  */
 vm_page_t
 vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
     int req, vm_page_t mpred)
 {
 	struct vm_domain *vmd;
 	vm_page_t m;
 	int flags, pool;
 
 	/*
 	 * VM_ALLOC_NOOBJ must be given exactly when there is no object,
 	 * VM_ALLOC_SBUSY requires an object, and NOBUSY and SBUSY are
 	 * mutually exclusive.
 	 */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("inconsistent object(%p)/req(%x)", object, req));
 	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
 	    ("Can't sleep and retry object insertion."));
 	KASSERT(mpred == NULL || mpred->pindex < pindex,
 	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
 	    (uintmax_t)pindex));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
 	flags = 0;
 	m = NULL;
 	/* Object-less pages come from the direct pool. */
 	pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT;
 again:
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Can we allocate the page from a reservation?
 	 */
 	if (vm_object_reserv(object) &&
 	    (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) !=
 	    NULL) {
 		goto found;
 	}
 #endif
 	vmd = VM_DOMAIN(domain);
 	/* Try the domain's page cache zone for this pool, if one exists. */
 	if (vmd->vmd_pgcache[pool].zone != NULL) {
 		m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT | M_NOVM);
 		if (m != NULL) {
 			flags |= PG_PCPU_CACHE;
 			goto found;
 		}
 	}
 	if (vm_domain_allocate(vmd, req, 1)) {
 		/*
 		 * If not, allocate it from the free page queues.
 		 */
 		vm_domain_free_lock(vmd);
 		m = vm_phys_alloc_pages(domain, pool, 0);
 		vm_domain_free_unlock(vmd);
 		if (m == NULL) {
 			/* Give back the page reserved in the free count. */
 			vm_domain_freecnt_inc(vmd, 1);
 #if VM_NRESERVLEVEL > 0
 			if (vm_reserv_reclaim_inactive(domain))
 				goto again;
 #endif
 		}
 	}
 	if (m == NULL) {
 		/*
 		 * Not allocatable, give up.
 		 */
 		if (vm_domain_alloc_fail(vmd, object, req))
 			goto again;
 		return (NULL);
 	}
 
 	/*
 	 * At this point we had better have found a good page.
 	 */
 found:
 	vm_page_dequeue(m);
 	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags |= (m->flags & PG_ZERO);
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
 	m->a.flags = 0;
 	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	/* Exclusive busy is the default; SBUSY/NOBUSY/NOOBJ override it. */
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
 	else if ((req & VM_ALLOC_SBUSY) != 0)
 		m->busy_lock = VPB_SHARERS_WORD(1);
 	else
 		m->busy_lock = VPB_UNBUSIED;
 	if (req & VM_ALLOC_WIRED) {
 		vm_wire_add(1);
 		m->ref_count = 1;
 	}
 	m->a.act_count = 0;
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
 			/*
 			 * Insertion failed: undo the wiring, return the page
 			 * to the free queues and report failure.
 			 */
 			if (req & VM_ALLOC_WIRED) {
 				vm_wire_sub(1);
 				m->ref_count = 0;
 			}
 			KASSERT(m->object == NULL, ("page %p has object", m));
 			m->oflags = VPO_UNMANAGED;
 			m->busy_lock = VPB_UNBUSIED;
 			/* Don't change PG_ZERO. */
 			vm_page_free_toq(m);
 			/* The object lock is dropped across the wait. */
 			if (req & VM_ALLOC_WAITFAIL) {
 				VM_OBJECT_WUNLOCK(object);
 				vm_radix_wait();
 				VM_OBJECT_WLOCK(object);
 			}
 			return (NULL);
 		}
 
 		/* Ignore device objects; the pager sets "memattr" for them. */
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    (object->flags & OBJ_FICTITIOUS) == 0)
 			pmap_page_set_memattr(m, object->memattr);
 	} else
 		m->pindex = pindex;
 
 	return (m);
 }
 
 /*
  *	vm_page_alloc_contig:
  *
  *	Allocate "npages" physically contiguous pages from the free lists.
  *	Every page of the set lies at or above the physical address "low"
  *	and below the physical address "high".  "alignment" gives the
  *	alignment of the first physical page in the set.  A non-zero
  *	"boundary" forbids the set from crossing any physical address
  *	that is a multiple of that value.  Both "alignment" and
  *	"boundary" must be a power of two.
  *
  *	When "memattr" is VM_MEMATTR_DEFAULT, the pages take the object's
  *	memory attribute setting.  Any other "memattr" value overrides
  *	the object's setting for these pages.  Consequently, if the
  *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, the
  *	pages cannot be configured to VM_MEMATTR_DEFAULT.
  *
  *	The specified object may not contain fictitious pages.
  *
  *	An allocation class must always be supplied by the caller.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  */
 vm_page_t
 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_page_t m;
 	int domain;
 
 	/* Walk the domains until one of them satisfies the request. */
 	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 	for (;;) {
 		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 		    npages, low, high, alignment, boundary, memattr);
 		if (m != NULL)
 			return (m);
 		if (vm_domainset_iter_page(&di, object, &domain) != 0)
 			return (NULL);
 	}
 }
 
 /*
  * Single-domain worker for vm_page_alloc_contig(): allocate "npages"
  * physically contiguous pages from "domain" subject to the low/high,
  * alignment and boundary constraints, initialize each page, and, when
  * "object" is non-NULL, insert the pages at consecutive indices starting
  * at "pindex".
  *
  * Returns the first page of the run, or NULL if the pages could not be
  * allocated or an insertion into the object failed.
  */
 vm_page_t
 vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
     int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	struct vm_domain *vmd;
 	vm_page_t m, m_ret, mpred;
 	u_int busy_lock, flags, oflags;
 
 	mpred = NULL;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 	    req));
 	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
 	    ("Can't sleep and retry object insertion."));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
 		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
 		    ("vm_page_alloc_contig: object %p has fictitious pages",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
 
 	/* Find the predecessor of the first page to be inserted. */
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		    ("vm_page_alloc_contig: pindex already allocated"));
 	}
 
 	/*
 	 * Can we allocate the pages without the number of free pages falling
 	 * below the lower bound for the allocation class?
 	 */
 	m_ret = NULL;
 again:
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Can we allocate the pages from a reservation?
 	 */
 	if (vm_object_reserv(object) &&
 	    (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req,
 	    mpred, npages, low, high, alignment, boundary)) != NULL) {
 		goto found;
 	}
 #endif
 	vmd = VM_DOMAIN(domain);
 	if (vm_domain_allocate(vmd, req, npages)) {
 		/*
 		 * allocate them from the free page queues.
 		 */
 		vm_domain_free_lock(vmd);
 		m_ret = vm_phys_alloc_contig(domain, npages, low, high,
 		    alignment, boundary);
 		vm_domain_free_unlock(vmd);
 		if (m_ret == NULL) {
 			/* Give back the pages reserved in the free count. */
 			vm_domain_freecnt_inc(vmd, npages);
 #if VM_NRESERVLEVEL > 0
 			if (vm_reserv_reclaim_contig(domain, npages, low,
 			    high, alignment, boundary))
 				goto again;
 #endif
 		}
 	}
 	if (m_ret == NULL) {
 		if (vm_domain_alloc_fail(vmd, object, req))
 			goto again;
 		return (NULL);
 	}
 #if VM_NRESERVLEVEL > 0
 found:
 #endif
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		vm_page_dequeue(m);
 		vm_page_alloc_check(m);
 	}
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	/* Exclusive busy is the default; SBUSY/NOBUSY/NOOBJ override it. */
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		busy_lock = VPB_CURTHREAD_EXCLUSIVE;
 	else if ((req & VM_ALLOC_SBUSY) != 0)
 		busy_lock = VPB_SHARERS_WORD(1);
 	else
 		busy_lock = VPB_UNBUSIED;
 	if ((req & VM_ALLOC_WIRED) != 0)
 		vm_wire_add(npages);
 	/* A default "memattr" defers to a non-default object attribute. */
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
 			memattr = object->memattr;
 	}
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->a.flags = 0;
 		/*
 		 * "flags" acts as a mask here: PG_ZERO survives only if the
 		 * page already had it and it was requested, while PG_NODUMP
 		 * ends up set exactly when it was requested.
 		 */
 		m->flags = (m->flags | PG_NODUMP) & flags;
 		m->busy_lock = busy_lock;
 		if ((req & VM_ALLOC_WIRED) != 0)
 			m->ref_count = 1;
 		m->a.act_count = 0;
 		m->oflags = oflags;
 		if (object != NULL) {
 			if (vm_page_insert_after(m, object, pindex, mpred)) {
 				/*
 				 * Insertion failed: undo the wiring and hand
 				 * every page of the run back to the free
 				 * queues.
 				 */
 				if ((req & VM_ALLOC_WIRED) != 0)
 					vm_wire_sub(npages);
 				KASSERT(m->object == NULL,
 				    ("page %p has object", m));
 				/*
 				 * Remember the failing page; only pages up to
 				 * and including it had ref_count set above.
 				 */
 				mpred = m;
 				for (m = m_ret; m < &m_ret[npages]; m++) {
 					if (m <= mpred &&
 					    (req & VM_ALLOC_WIRED) != 0)
 						m->ref_count = 0;
 					m->oflags = VPO_UNMANAGED;
 					m->busy_lock = VPB_UNBUSIED;
 					/* Don't change PG_ZERO. */
 					vm_page_free_toq(m);
 				}
 				/* The object lock is dropped for the wait. */
 				if (req & VM_ALLOC_WAITFAIL) {
 					VM_OBJECT_WUNLOCK(object);
 					vm_radix_wait();
 					VM_OBJECT_WLOCK(object);
 				}
 				return (NULL);
 			}
 			/* This page precedes the next one to be inserted. */
 			mpred = m;
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
 	return (m_ret);
 }
 
 /*
  * Allocate a physical page that is not intended to be inserted into a VM
  * object.  If the "freelist" parameter is not equal to VM_NFREELIST, then only
  * pages from the specified vm_phys freelist will be returned.
  *
  * When no specific freelist is requested, the domain's direct-pool page
  * cache zone is tried first; otherwise, or if the cache yields nothing, a
  * page is reserved in the domain's free count and taken from the physical
  * free page queues.
  *
  * The returned page is unbusied and marked VPO_UNMANAGED.  VM_ALLOC_WIRED
  * sets its reference count to one, VM_ALLOC_NODUMP sets PG_NODUMP, and
  * VM_ALLOC_ZERO guarantees zeroed contents (the page is zeroed here unless
  * it already carries PG_ZERO).  Returns NULL if no page could be allocated.
  */
 static __always_inline vm_page_t
 _vm_page_alloc_noobj_domain(int domain, const int freelist, int req)
 {
 	struct vm_domain *vmd;
 	vm_page_t m;
 	int flags;
 
 	KASSERT((req & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 	    VM_ALLOC_NOOBJ)) == 0,
 	    ("%s: invalid req %#x", __func__, req));
 
 	flags = (req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0;
 	vmd = VM_DOMAIN(domain);
 
 	/*
 	 * "m" must start out NULL: when the page cache is skipped (a specific
 	 * freelist was requested or no cache zone exists) and
 	 * vm_domain_allocate() refuses the request, nothing below assigns
 	 * "m" before it is tested.  Every "goto again" is taken with "m"
 	 * already NULL, so a single initialization suffices.
 	 */
 	m = NULL;
 again:
 	if (freelist == VM_NFREELIST &&
 	    vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) {
 		m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone,
 		    M_NOWAIT | M_NOVM);
 		if (m != NULL) {
 			flags |= PG_PCPU_CACHE;
 			goto found;
 		}
 	}
 
 	if (vm_domain_allocate(vmd, req, 1)) {
 		vm_domain_free_lock(vmd);
 		if (freelist == VM_NFREELIST)
 			m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0);
 		else
 			m = vm_phys_alloc_freelist_pages(domain, freelist,
 			    VM_FREEPOOL_DIRECT, 0);
 		vm_domain_free_unlock(vmd);
 		/* Give back the page reserved in the free count. */
 		if (m == NULL)
 			vm_domain_freecnt_inc(vmd, 1);
 	}
 	if (m == NULL) {
 		if (vm_domain_alloc_fail(vmd, NULL, req))
 			goto again;
 		return (NULL);
 	}
 
 found:
 	vm_page_dequeue(m);
 	vm_page_alloc_check(m);
 
 	/* Consumers should not rely on a useful default pindex value. */
 	m->pindex = 0xdeadc0dedeadc0de;
 	/* Only PG_ZERO is inherited from the page's previous flags. */
 	m->flags = (m->flags & PG_ZERO) | flags;
 	m->a.flags = 0;
 	m->oflags = VPO_UNMANAGED;
 	m->busy_lock = VPB_UNBUSIED;
 	if ((req & VM_ALLOC_WIRED) != 0) {
 		vm_wire_add(1);
 		m->ref_count = 1;
 	}
 
 	/* Zero the page on request unless it is already known to be zero. */
 	if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	return (m);
 }
 
 /*
  * Allocate an object-less page from the given vm_phys freelist, trying
  * domains in the order produced by the domainset iterator.  The result is
  * NULL if no domain could supply a page.
  */
 vm_page_t
 vm_page_alloc_freelist(int freelist, int req)
 {
 	struct vm_domainset_iter di;
 	vm_page_t m;
 	int domain;
 
 	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	for (;;) {
 		m = vm_page_alloc_freelist_domain(domain, freelist, req);
 		if (m != NULL)
 			return (m);
 		if (vm_domainset_iter_page(&di, NULL, &domain) != 0)
 			return (NULL);
 	}
 }
 
 /*
  * Allocate an object-less page from the given vm_phys freelist of a
  * specific domain.  "freelist" must name a valid freelist.
  */
 vm_page_t
 vm_page_alloc_freelist_domain(int domain, int freelist, int req)
 {
 	vm_page_t m;
 
 	KASSERT(freelist >= 0 && freelist < VM_NFREELIST,
 	    ("%s: invalid freelist %d", __func__, freelist));
 
 	m = _vm_page_alloc_noobj_domain(domain, freelist, req);
 	return (m);
 }
 
 /*
  * Allocate a page that does not belong to any object, trying domains in
  * the order produced by the domainset iterator.  The result is NULL if no
  * domain could supply a page.
  */
 vm_page_t
 vm_page_alloc_noobj(int req)
 {
 	struct vm_domainset_iter di;
 	vm_page_t m;
 	int domain;
 
 	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	for (;;) {
 		m = vm_page_alloc_noobj_domain(domain, req);
 		if (m != NULL)
 			return (m);
 		if (vm_domainset_iter_page(&di, NULL, &domain) != 0)
 			return (NULL);
 	}
 }
 
 /*
  * Allocate a page that does not belong to any object from a specific
  * domain, without restricting the allocation to a particular freelist.
  */
 vm_page_t
 vm_page_alloc_noobj_domain(int domain, int req)
 {
 	vm_page_t m;
 
 	m = _vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req);
 	return (m);
 }
 
 /*
  * Allocate "npages" physically contiguous pages that do not belong to any
  * object, trying domains in the order produced by the domainset iterator.
  * The result is the first page of the run, or NULL if no domain could
  * satisfy the request.
  */
 vm_page_t
 vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_page_t m;
 	int domain;
 
 	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	for (;;) {
 		m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary, memattr);
 		if (m != NULL)
 			return (m);
 		if (vm_domainset_iter_page(&di, NULL, &domain) != 0)
 			return (NULL);
 	}
 }
 
 /*
  * Allocate "npages" physically contiguous, object-less pages from a
  * specific domain.  With VM_ALLOC_ZERO, every page of the run that is not
  * already marked PG_ZERO is zeroed before the run is handed back.
  */
 vm_page_t
 vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	vm_page_t m, p;
 
 	KASSERT((req & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 	    VM_ALLOC_NOOBJ)) == 0,
 	    ("%s: invalid req %#x", __func__, req));
 
 	m = vm_page_alloc_contig_domain(NULL, 0, domain, req | VM_ALLOC_NOOBJ,
 	    npages, low, high, alignment, boundary, memattr);
 	if (m == NULL || (req & VM_ALLOC_ZERO) == 0)
 		return (m);
 
 	for (p = m; p < &m[npages]; p++) {
 		if ((p->flags & PG_ZERO) == 0)
 			pmap_zero_page(p);
 	}
 	return (m);
 }
 
 /*
  * Check a page that has been freshly dequeued from a freelist.
  *
  * Asserts that the page carries no leftover state: no object, no page
  * queue or queue-state flags, no references, a "freed" busy state, no
  * dirty or valid bits, and the default memory attribute.  Finishes with
  * the pmap layer's own allocation check.
  */
 static void
 vm_page_alloc_check(vm_page_t m)
 {
 
 	KASSERT(m->object == NULL, ("page %p has object", m));
 	/* The page must not be on, or in transit to, any page queue. */
 	KASSERT(m->a.queue == PQ_NONE &&
 	    (m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
 	    ("page %p has unexpected queue %d, flags %#x",
 	    m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK)));
 	KASSERT(m->ref_count == 0, ("page %p has references", m));
 	KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m));
 	KASSERT(m->dirty == 0, ("page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
 	KASSERT(m->valid == 0, ("free page %p is valid", m));
 	pmap_vm_page_alloc_check(m);
 }
 
 /*
  * Import routine for a page cache: fill "store" with up to "cnt" pages
  * taken from the physical free page queues of the cache's domain and pool.
  * The result is the number of pages actually stored.  "arg" is the
  * struct vm_pgcache; the "domain" argument is overwritten with the
  * cache's own domain.
  */
 static int
 vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct vm_pgcache *pgcache;
 	struct vm_domain *vmd;
 	int got;
 
 	pgcache = arg;
 	vmd = VM_DOMAIN(pgcache->domain);
 
 	/*
 	 * Since the page daemon exists to replenish the store of free
 	 * pages, it should not add memory pressure of its own.  Refuse as
 	 * well when the domain is severely short of pages.
 	 */
 	if (vmd->vmd_severeset || curproc == pageproc)
 		return (0);
 	if (!_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
 		return (0);
 
 	domain = vmd->vmd_domain;
 	vm_domain_free_lock(vmd);
 	got = vm_phys_alloc_npages(domain, pgcache->pool, cnt,
 	    (vm_page_t *)store);
 	vm_domain_free_unlock(vmd);
 
 	/* Return the unused part of the reservation to the free count. */
 	if (got != cnt)
 		vm_domain_freecnt_inc(vmd, cnt - got);
 
 	return (got);
 }
 
 static void
 vm_page_zone_release(void *arg, void **store, int cnt)
 {
 	struct vm_domain *vmd;
 	struct vm_pgcache *pgcache;
 	vm_page_t m;
 	int i;
 
 	pgcache = arg;
 	vmd = VM_DOMAIN(pgcache->domain);
 	vm_domain_free_lock(vmd);
 	for (i = 0; i < cnt; i++) {
 		m = (vm_page_t)store[i];
 		vm_phys_free_pages(m, 0);
 	}
 	vm_domain_free_unlock(vmd);
 	vm_domain_freecnt_inc(vmd, cnt);
 }
 
 /* Values for the "options" argument of vm_page_scan_contig(). */
 #define	VPSC_ANY	0	/* No restrictions. */
 #define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
 #define	VPSC_NOSUPER	2	/* Skip superpages. */
 
 /*
  *	vm_page_scan_contig:
  *
  *	Scan vm_page_array[] between the specified entries "m_start" and
  *	"m_end" for a run of contiguous physical pages that satisfy the
  *	specified conditions, and return the lowest page in the run.  The
  *	specified "alignment" determines the alignment of the lowest physical
  *	page in the run.  If the specified "boundary" is non-zero, then the
  *	run of physical pages cannot span a physical address that is a
  *	multiple of "boundary".
  *
  *	"m_end" is never dereferenced, so it need not point to a vm_page
  *	structure within vm_page_array[].
  *
  *	"npages" must be greater than zero.  "m_start" and "m_end" must not
  *	span a hole (or discontiguity) in the physical address space.  Both
  *	"alignment" and "boundary" must be a power of two.
  *
  *	"options" is tested against the VPSC_* bits above.  NULL is returned
  *	if no run of at least "npages" pages is found.
  */
 vm_page_t
 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
     u_long alignment, vm_paddr_t boundary, int options)
 {
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_run;
 #if VM_NRESERVLEVEL > 0
 	int level;
 #endif
 	int m_inc, order, run_ext, run_len;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	m_run = NULL;
 	run_len = 0;
 	/*
 	 * m_inc is the stride to the next page to examine; it is set on
 	 * every path through the loop body before the increment runs.
 	 */
 	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
 		KASSERT((m->flags & PG_MARKER) == 0,
 		    ("page %p is PG_MARKER", m));
 		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1,
 		    ("fictitious page %p has invalid ref count", m));
 
 		/*
 		 * If the current page would be the start of a run, check its
 		 * physical address against the end, alignment, and boundary
 		 * conditions.  If it doesn't satisfy these conditions, either
 		 * terminate the scan or advance to the next page that
 		 * satisfies the failed condition.
 		 */
 		if (run_len == 0) {
 			KASSERT(m_run == NULL, ("m_run != NULL"));
 			if (m + npages > m_end)
 				break;
 			pa = VM_PAGE_TO_PHYS(m);
 			if ((pa & (alignment - 1)) != 0) {
 				m_inc = atop(roundup2(pa, alignment) - pa);
 				continue;
 			}
 			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
 			    boundary) != 0) {
 				m_inc = atop(roundup2(pa, boundary) - pa);
 				continue;
 			}
 		} else
 			KASSERT(m_run != NULL, ("m_run == NULL"));
 
 		/*
 		 * Classify the page.  run_ext is the number of pages by which
 		 * this page extends the current run; zero resets the run.
 		 */
 retry:
 		m_inc = 1;
 		if (vm_page_wired(m))
 			run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Note that "level" is assigned here as a side effect and is
 		 * consulted again by the "level >= 0" branch further down.
 		 */
 		else if ((level = vm_reserv_level(m)) >= 0 &&
 		    (options & VPSC_NORESERV) != 0) {
 			run_ext = 0;
 			/* Advance to the end of the reservation. */
 			pa = VM_PAGE_TO_PHYS(m);
 			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
 			    pa);
 		}
 #endif
 		else if ((object = atomic_load_ptr(&m->object)) != NULL) {
 			/*
 			 * The page is considered eligible for relocation if
 			 * and only if it could be laundered or reclaimed by
 			 * the page daemon.
 			 */
 			VM_OBJECT_RLOCK(object);
 			/*
 			 * The object pointer was read without the lock; if
 			 * the page changed objects meanwhile, start over.
 			 */
 			if (object != m->object) {
 				VM_OBJECT_RUNLOCK(object);
 				goto retry;
 			}
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (object->type != OBJT_DEFAULT &&
 			    (object->flags & OBJ_SWAP) == 0 &&
 			    object->type != OBJT_VNODE) {
 				run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 			} else if ((options & VPSC_NOSUPER) != 0 &&
 			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
 				run_ext = 0;
 				/* Advance to the end of the superpage. */
 				pa = VM_PAGE_TO_PHYS(m);
 				m_inc = atop(roundup2(pa + 1,
 				    vm_reserv_size(level)) - pa);
 #endif
 			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
 			    vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) {
 				/*
 				 * The page is allocated but eligible for
 				 * relocation.  Extend the current run by one
 				 * page.
 				 */
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT((m->oflags & (VPO_SWAPINPROG |
 				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: PGA_NOSYNC. */
 				run_ext = 1;
 			} else
 				run_ext = 0;
 			VM_OBJECT_RUNLOCK(object);
 #if VM_NRESERVLEVEL > 0
 		} else if (level >= 0) {
 			/*
 			 * The page is reserved but not yet allocated.  In
 			 * other words, it is still free.  Extend the current
 			 * run by one page.
 			 */
 			run_ext = 1;
 #endif
 		} else if ((order = m->order) < VM_NFREEORDER) {
 			/*
 			 * The page is enqueued in the physical memory
 			 * allocator's free page queues.  Moreover, it is the
 			 * first page in a power-of-two-sized run of
 			 * contiguous free pages.  Add these pages to the end
 			 * of the current run, and jump ahead.
 			 */
 			run_ext = 1 << order;
 			m_inc = 1 << order;
 		} else {
 			/*
 			 * Skip the page for one of the following reasons: (1)
 			 * It is enqueued in the physical memory allocator's
 			 * free page queues.  However, it is not the first
 			 * page in a run of contiguous free pages.  (This case
 			 * rarely occurs because the scan is performed in
 			 * ascending order.) (2) It is not reserved, and it is
 			 * transitioning from free to allocated.  (Conversely,
 			 * the transition from allocated to free for managed
 			 * pages is blocked by the page busy lock.) (3) It is
 			 * allocated but not contained by an object and not
 			 * wired, e.g., allocated by Xen's balloon driver.
 			 */
 			run_ext = 0;
 		}
 
 		/*
 		 * Extend or reset the current run of pages.
 		 */
 		if (run_ext > 0) {
 			if (run_len == 0)
 				m_run = m;
 			run_len += run_ext;
 		} else {
 			if (run_len > 0) {
 				m_run = NULL;
 				run_len = 0;
 			}
 		}
 	}
 	/* The loop also ends at m_end; only a complete run is a success. */
 	if (run_len >= npages)
 		return (m_run);
 	return (NULL);
 }
 
 /*
  *	vm_page_reclaim_run:
  *
  *	Try to relocate each of the allocated virtual pages within the
  *	specified run of physical pages to a new physical address.  Free the
  *	physical pages underlying the relocated virtual pages.  A virtual page
  *	is relocatable if and only if it could be laundered or reclaimed by
  *	the page daemon.  Whenever possible, a virtual page is relocated to a
  *	physical address above "high".
  *
  *	Returns 0 if every physical page within the run was already free or
  *	just freed by a successful relocation.  Otherwise, returns a non-zero
  *	value indicating why the last attempt to relocate a virtual page was
  *	unsuccessful.
  *
  *	"req_class" must be an allocation class.
  */
 static int
 vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
     vm_paddr_t high)
 {
 	struct vm_domain *vmd;
 	struct spglist free;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_end, m_new;
 	int error, order, req;
 
 	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
 	    ("req_class is not an allocation class"));
 	SLIST_INIT(&free);
 	error = 0;
 	m = m_run;
 	m_end = m_run + npages;
 	/* Visit each page of the run; the first failure ends the walk. */
 	for (; error == 0 && m < m_end; m++) {
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 
 		/*
 		 * Racily check for wirings.  Races are handled once the object
 		 * lock is held and the page is unmapped.
 		 */
 		if (vm_page_wired(m))
 			error = EBUSY;
 		else if ((object = atomic_load_ptr(&m->object)) != NULL) {
 			/*
 			 * The page is relocated if and only if it could be
 			 * laundered or reclaimed by the page daemon.
 			 */
 			VM_OBJECT_WLOCK(object);
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (m->object != object ||
 			    (object->type != OBJT_DEFAULT &&
 			    (object->flags & OBJ_SWAP) == 0 &&
 			    object->type != OBJT_VNODE))
 				error = EINVAL;
 			else if (object->memattr != VM_MEMATTR_DEFAULT)
 				error = EINVAL;
 			else if (vm_page_queue(m) != PQ_NONE &&
 			    vm_page_tryxbusy(m) != 0) {
 				/* Recheck wirings now that the page is xbusy. */
 				if (vm_page_wired(m)) {
 					vm_page_xunbusy(m);
 					error = EBUSY;
 					goto unlock;
 				}
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT(m->oflags == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: PGA_NOSYNC. */
 				if (!vm_page_none_valid(m)) {
 					/*
 					 * First, try to allocate a new page
 					 * that is above "high".  Failing
 					 * that, try to allocate a new page
 					 * that is below "m_run".  Allocate
 					 * the new page between the end of
 					 * "m_run" and "high" only as a last
 					 * resort.
 					 */
-					req = req_class | VM_ALLOC_NOOBJ;
+					req = req_class;
 					if ((m->flags & PG_NODUMP) != 0)
 						req |= VM_ALLOC_NODUMP;
 					if (trunc_page(high) !=
 					    ~(vm_paddr_t)PAGE_MASK) {
-						m_new = vm_page_alloc_contig(
-						    NULL, 0, req, 1,
-						    round_page(high),
-						    ~(vm_paddr_t)0,
-						    PAGE_SIZE, 0,
-						    VM_MEMATTR_DEFAULT);
+						m_new =
+						    vm_page_alloc_noobj_contig(
+						    req, 1, round_page(high),
+						    ~(vm_paddr_t)0, PAGE_SIZE,
+						    0, VM_MEMATTR_DEFAULT);
 					} else
 						m_new = NULL;
 					if (m_new == NULL) {
 						pa = VM_PAGE_TO_PHYS(m_run);
-						m_new = vm_page_alloc_contig(
-						    NULL, 0, req, 1,
-						    0, pa - 1, PAGE_SIZE, 0,
+						m_new =
+						    vm_page_alloc_noobj_contig(
+						    req, 1, 0, pa - 1,
+						    PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						pa += ptoa(npages);
-						m_new = vm_page_alloc_contig(
-						    NULL, 0, req, 1,
-						    pa, high, PAGE_SIZE, 0,
-						    VM_MEMATTR_DEFAULT);
+						m_new =
+						    vm_page_alloc_noobj_contig(
+						    req, 1, pa, high, PAGE_SIZE,
+						    0, VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						vm_page_xunbusy(m);
 						error = ENOMEM;
 						goto unlock;
 					}
 
 					/*
 					 * Unmap the page and check for new
 					 * wirings that may have been acquired
 					 * through a pmap lookup.
 					 */
 					if (object->ref_count != 0 &&
 					    !vm_page_try_remove_all(m)) {
 						vm_page_xunbusy(m);
 						vm_page_free(m_new);
 						error = EBUSY;
 						goto unlock;
 					}
 
 					/*
 					 * Replace "m" with the new page.  For
 					 * vm_page_replace(), "m" must be busy
 					 * and dequeued.  Finally, change "m"
 					 * as if vm_page_free() was called.
 					 */
 					m_new->a.flags = m->a.flags &
 					    ~PGA_QUEUE_STATE_MASK;
 					KASSERT(m_new->oflags == VPO_UNMANAGED,
 					    ("page %p is managed", m_new));
 					m_new->oflags = 0;
 					pmap_copy_page(m, m_new);
 					m_new->valid = m->valid;
 					m_new->dirty = m->dirty;
 					m->flags &= ~PG_ZERO;
 					vm_page_dequeue(m);
 					/*
 					 * Defer the actual free: collect "m"
 					 * on the local list, which is flushed
 					 * after the loop.
 					 */
 					if (vm_page_replace_hold(m_new, object,
 					    m->pindex, m) &&
 					    vm_page_free_prep(m))
 						SLIST_INSERT_HEAD(&free, m,
 						    plinks.s.ss);
 
 					/*
 					 * The new page must be deactivated
 					 * before the object is unlocked.
 					 */
 					vm_page_deactivate(m_new);
 				} else {
 					/*
 					 * No part of the page is valid, so
 					 * there is nothing to copy; just
 					 * prepare it for freeing.
 					 */
 					m->flags &= ~PG_ZERO;
 					vm_page_dequeue(m);
 					if (vm_page_free_prep(m))
 						SLIST_INSERT_HEAD(&free, m,
 						    plinks.s.ss);
 					KASSERT(m->dirty == 0,
 					    ("page %p is dirty", m));
 				}
 			} else
 				error = EBUSY;
 unlock:
 			VM_OBJECT_WUNLOCK(object);
 		} else {
 			/*
 			 * The page has no object.  Inspect its free-queue
 			 * state under the domain free lock.
 			 */
 			MPASS(vm_page_domain(m) == domain);
 			vmd = VM_DOMAIN(domain);
 			vm_domain_free_lock(vmd);
 			order = m->order;
 			if (order < VM_NFREEORDER) {
 				/*
 				 * The page is enqueued in the physical memory
 				 * allocator's free page queues.  Moreover, it
 				 * is the first page in a power-of-two-sized
 				 * run of contiguous free pages.  Jump ahead
 				 * to the last page within that run, and
 				 * continue from there.
 				 */
 				m += (1 << order) - 1;
 			}
 #if VM_NRESERVLEVEL > 0
 			else if (vm_reserv_is_page_free(m))
 				order = 0;
 #endif
 			vm_domain_free_unlock(vmd);
 			/*
 			 * "order" is still VM_NFREEORDER only if neither of
 			 * the cases above recognized the page as free.
 			 */
 			if (order == VM_NFREEORDER)
 				error = EINVAL;
 		}
 	}
 	/*
 	 * Hand every page collected on the local list to vm_phys in a single
 	 * pass under the domain free lock, then credit the free count.
 	 */
 	if ((m = SLIST_FIRST(&free)) != NULL) {
 		int cnt;
 
 		vmd = VM_DOMAIN(domain);
 		cnt = 0;
 		vm_domain_free_lock(vmd);
 		do {
 			MPASS(vm_page_domain(m) == domain);
 			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 			vm_phys_free_pages(m, 0);
 			cnt++;
 		} while ((m = SLIST_FIRST(&free)) != NULL);
 		vm_domain_free_unlock(vmd);
 		vm_domain_freecnt_inc(vmd, cnt);
 	}
 	return (error);
 }
 
 /* Number of slots in the "m_runs" ring used by the contig reclaim scan. */
 #define	NRUNS	16
 
 /* RUN_INDEX() masks with (NRUNS - 1), so NRUNS must be a power of two. */
 CTASSERT(powerof2(NRUNS));
 
 /* Map a running count of recorded runs to its slot in the ring. */
 #define	RUN_INDEX(count)	((count) & (NRUNS - 1))
 
 /*
  * Number of reclaimed pages at which vm_page_reclaim_contig_domain() stops
  * reclaiming further runs and reports success.
  */
 #define	MIN_RECLAIM	8
 
 /*
  *	vm_page_reclaim_contig_domain:
  *
  *	Within the given domain, reclaim allocated, contiguous physical
  *	memory that meets the caller's constraints by relocating the virtual
  *	pages currently using it.  Returns true if reclamation is successful
  *	and false otherwise.  Relocation itself needs free pages, so a
  *	shortage of free pages can make reclamation fail; callers are then
  *	expected to vm_wait() before retrying the allocation that failed,
  *	e.g., vm_page_alloc_contig().
  *
  *	"req" must always carry an allocation class:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	Any optional allocation flags in "req" are ignored.
  *
  *	"npages" must be greater than zero, and "alignment" and "boundary"
  *	must each be a power of two.
  */
 bool
 vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_domain *vmd;
 	vm_paddr_t scan_low;
 	vm_page_t run, m_runs[NRUNS];
 	u_long minalign, nfree, nruns, reclaimed;
 	int error, options, req_class, tries;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 
 	/*
 	 * After reclamation the caller retries its allocation from the
 	 * vm_phys buddy lists.  Because of vm_phys_alloc_contig()'s
 	 * limitations, pad the length up to the next power of two (capped
 	 * at the maximum chunk size) and make every run at least that
 	 * aligned.
 	 */
 	minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1);
 	npages = roundup2(npages, minalign);
 	if (ptoa(minalign) > alignment)
 		alignment = ptoa(minalign);
 
 	/*
 	 * Let the page daemon dig deeper into the free page list.
 	 */
 	req_class = req & VM_ALLOC_CLASS_MASK;
 	if (req_class != VM_ALLOC_INTERRUPT && curproc == pageproc)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Give up immediately when the free page count cannot cover the
 	 * request for this allocation class.
 	 */
 	vmd = VM_DOMAIN(domain);
 	nfree = vmd->vmd_free_count;
 	if (nfree < npages + vmd->vmd_free_reserved)
 		return (false);
 	if (req_class == VM_ALLOC_SYSTEM &&
 	    nfree < npages + vmd->vmd_interrupt_free_min)
 		return (false);
 	if (req_class == VM_ALLOC_INTERRUPT && nfree < npages)
 		return (false);
 
 	/*
 	 * Make up to three passes, loosening the restrictions ("options")
 	 * on reclaiming reservations and superpages with each pass.
 	 */
 	for (options = VPSC_NORESERV;;) {
 		/*
 		 * Collect the candidate runs in ascending order.  "m_runs"
 		 * is a ring, so only the highest NRUNS candidates survive.
 		 */
 		scan_low = low;
 		nruns = 0;
 		while ((run = vm_phys_scan_contig(domain, npages, scan_low,
 		    high, alignment, boundary, options)) != NULL) {
 			m_runs[RUN_INDEX(nruns)] = run;
 			nruns++;
 			scan_low = VM_PAGE_TO_PHYS(run) + ptoa(npages);
 		}
 
 		/*
 		 * Reclaim the recorded runs from highest to lowest until at
 		 * least MIN_RECLAIM pages have been reclaimed.  "reclaimed"
 		 * restarts from zero on every pass because reclamation is
 		 * idempotent and runs will (likely) recur as restrictions
 		 * are relaxed.
 		 */
 		reclaimed = 0;
 		for (tries = 0; tries < NRUNS && nruns > 0; tries++) {
 			nruns--;
 			error = vm_page_reclaim_run(req_class, domain, npages,
 			    m_runs[RUN_INDEX(nruns)], high);
 			if (error != 0)
 				continue;
 			reclaimed += npages;
 			if (reclaimed >= MIN_RECLAIM)
 				return (true);
 		}
 
 		/*
 		 * Move on to the next, less restrictive pass, or report the
 		 * outcome if this pass was already unrestricted.
 		 */
 		if (options == VPSC_NORESERV) {
 			options = VPSC_NOSUPER;
 			continue;
 		}
 		if (options == VPSC_NOSUPER) {
 			options = VPSC_ANY;
 			continue;
 		}
 		if (options == VPSC_ANY)
 			return (reclaimed != 0);
 	}
 }
 
 /*
  * Run vm_page_reclaim_contig_domain() on each domain handed out by the
  * domainset iterator, stopping at the first domain that succeeds or when
  * the iterator returns non-zero.
  */
 bool
 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_domainset_iter di;
 	int domain;
 
 	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	for (;;) {
 		if (vm_page_reclaim_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary))
 			return (true);
 		if (vm_domainset_iter_page(&di, NULL, &domain) != 0)
 			return (false);
 	}
 }
 
 /*
  * Add the domain to the "min" and/or "severe" page level domainsets when
  * the matching paging predicate holds and the domain is not yet a member.
  */
 void
 vm_domain_set(struct vm_domain *vmd)
 {
 
 	mtx_lock(&vm_domainset_lock);
 
 	/* "min" domainset membership. */
 	if (!vmd->vmd_minset) {
 		if (vm_paging_min(vmd)) {
 			vmd->vmd_minset = 1;
 			DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
 		}
 	}
 
 	/* "severe" domainset membership. */
 	if (!vmd->vmd_severeset) {
 		if (vm_paging_severe(vmd)) {
 			vmd->vmd_severeset = 1;
 			DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
 		}
 	}
 
 	mtx_unlock(&vm_domainset_lock);
 }
 
 /*
  * Remove the domain from the "min" and/or "severe" page level domainsets
  * once the matching paging predicate no longer holds, and wake up any
  * threads sleeping on the affected wait channels.
  */
 void
 vm_domain_clear(struct vm_domain *vmd)
 {
 
 	mtx_lock(&vm_domainset_lock);
 
 	/* Leave the "min" set and release its waiters. */
 	if (vmd->vmd_minset) {
 		if (!vm_paging_min(vmd)) {
 			vmd->vmd_minset = 0;
 			DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
 			if (vm_min_waiters != 0) {
 				vm_min_waiters = 0;
 				wakeup(&vm_min_domains);
 			}
 		}
 	}
 
 	/* Leave the "severe" set and release its waiters. */
 	if (vmd->vmd_severeset) {
 		if (!vm_paging_severe(vmd)) {
 			vmd->vmd_severeset = 0;
 			DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
 			if (vm_severe_waiters != 0) {
 				vm_severe_waiters = 0;
 				wakeup(&vm_severe_domains);
 			}
 		}
 	}
 
 	/*
 	 * Let the pageout daemon know that pages have become free if it
 	 * was waiting for some.
 	 */
 	if (vmd->vmd_pageout_pages_needed &&
 	    vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
 		wakeup(&vmd->vmd_pageout_pages_needed);
 		vmd->vmd_pageout_pages_needed = 0;
 	}
 
 	/* See comments in vm_wait_doms(). */
 	if (vm_pageproc_waiters) {
 		vm_pageproc_waiters = 0;
 		wakeup(&vm_pageproc_waiters);
 	}
 
 	mtx_unlock(&vm_domainset_lock);
 }
 
 /*
  * Block until vm_page_count_min() no longer reports a global shortage
  * relative to the min threshold.
  */
 void
 vm_wait_min(void)
 {
 
 	mtx_lock(&vm_domainset_lock);
 	for (;;) {
 		if (!vm_page_count_min())
 			break;
 		vm_min_waiters++;
 		msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
 	}
 	mtx_unlock(&vm_domainset_lock);
 }
 
 /*
  * Block until vm_page_count_severe() no longer reports a global shortage
  * relative to the severe threshold.
  */
 void
 vm_wait_severe(void)
 {
 
 	mtx_lock(&vm_domainset_lock);
 	for (;;) {
 		if (!vm_page_count_severe())
 			break;
 		vm_severe_waiters++;
 		msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
 		    "vmwait", 0);
 	}
 	mtx_unlock(&vm_domainset_lock);
 }
 
 /*
  * Return the sum of the severe, min and pageproc waiter counters.  The
  * counters are read without taking vm_domainset_lock.
  */
 u_int
 vm_wait_count(void)
 {
 	u_int total;
 
 	total = vm_severe_waiters;
 	total += vm_min_waiters;
 	total += vm_pageproc_waiters;
 	return (total);
 }
 
 /*
  * Sleep until pages may be available in the domains of "wdoms".  "mflags"
  * is OR'ed into the msleep() priority argument.  Returns the msleep()
  * result, or 0 if no sleep was necessary.
  */
 int
 vm_wait_doms(const domainset_t *wdoms, int mflags)
 {
 
 	/*
 	 * A non-specific vm_wait by the pageproc relies on racy wakeup
 	 * synchronization so that it can avoid expensive global locking.
 	 * To cope with lost wakeups the sleep is limited to a single tick.
 	 * Most pageproc allocations are expected to come from kmem or
 	 * vm_page_grab*, which use the more specific and race-free
 	 * vm_wait_domain().
 	 */
 	if (curproc == pageproc) {
 		mtx_lock(&vm_domainset_lock);
 		vm_pageproc_waiters++;
 		return (msleep(&vm_pageproc_waiters, &vm_domainset_lock,
 		    PVM | PDROP | mflags, "pageprocwait", 1));
 	}
 
 	/*
 	 * XXX Ideally we would wait only until the allocation could be
 	 * satisfied.  As written, new allocators can consume all freed
 	 * pages while old allocators are still waiting.
 	 */
 	mtx_lock(&vm_domainset_lock);
 	if (!vm_page_count_min_set(wdoms)) {
 		mtx_unlock(&vm_domainset_lock);
 		return (0);
 	}
 	vm_min_waiters++;
 	return (msleep(&vm_min_domains, &vm_domainset_lock,
 	    PVM | PDROP | mflags, "vmwait", 0));
 }
 
 /*
  *	vm_wait_domain:
  *
  *	Sleep until free pages are available for allocation in the given
  *	domain.  The domain free lock must not be held.
  *	- Called in various places after failed memory allocations.
  */
 void
 vm_wait_domain(int domain)
 {
 	struct vm_domain *vmd;
 	domainset_t wdom;
 
 	vmd = VM_DOMAIN(domain);
 	vm_domain_free_assert_unlocked(vmd);
 
 	if (curproc != pageproc) {
 		/* Ordinary thread: wait on a set holding only this domain. */
 		if (pageproc == NULL)
 			panic("vm_wait in early boot");
 		DOMAINSET_ZERO(&wdom);
 		DOMAINSET_SET(vmd->vmd_domain, &wdom);
 		vm_wait_doms(&wdom, 0);
 		return;
 	}
 
 	/* The pageproc sleeps on its own per-domain channel. */
 	mtx_lock(&vm_domainset_lock);
 	if (vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
 		mtx_unlock(&vm_domainset_lock);
 		return;
 	}
 	vmd->vmd_pageout_pages_needed = 1;
 	msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock,
 	    PDROP | PSWP, "VMWait", 0);
 }
 
 /*
  * Wait on the domain set taken from the object's policy, falling back to
  * the current thread's policy when "obj" is NULL or has no policy.
  * "mflags" is passed through to vm_wait_doms(), whose result is returned.
  */
 static int
 vm_wait_flags(vm_object_t obj, int mflags)
 {
 	struct domainset *policy;
 
 	/*
 	 * Read each policy pointer exactly once: a struct domainset is
 	 * itself immutable, but the pointer to it might change.
 	 */
 	policy = (obj != NULL) ? obj->domain.dr_policy : NULL;
 	if (policy == NULL)
 		policy = curthread->td_domain.dr_policy;
 
 	return (vm_wait_doms(&policy->ds_mask, mflags));
 }
 
 /*
  *	vm_wait:
  *
  *	Sleep until free pages are available for allocation in the
  *	affinity domains of the obj.  If obj is NULL, the domain set
  *	for the calling thread is used.
  *	Called in various places after failed memory allocations.
  *
  *	Calls vm_wait_flags() with no additional sleep flags and discards
  *	its return value.
  */
 void
 vm_wait(vm_object_t obj)
 {
 	(void)vm_wait_flags(obj, 0);
 }
 
 /*
  *	vm_wait_intr:
  *
  *	Like vm_wait(), but passes PCATCH as the sleep flag and returns the
  *	result of vm_wait_flags() to the caller.
  */
 int
 vm_wait_intr(vm_object_t obj)
 {
 	return (vm_wait_flags(obj, PCATCH));
 }
 
 /*
  *	vm_domain_alloc_fail:
  *
  *	Called when a page allocation function fails.  Adds the page count
  *	encoded in "req" (at least one) to the domain's pageout deficit
  *	and, if the request permits waiting (VM_ALLOC_WAITOK or
  *	VM_ALLOC_WAITFAIL), sleeps in vm_wait_domain().  The domain free
  *	lock must not be held on entry.  When "object" is not NULL, its
  *	write lock is released around the sleep and reacquired afterwards.
  *	Returns EAGAIN when the caller should retry (VM_ALLOC_WAITOK) and
  *	0 otherwise.
  */
 static int
 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
 {
 	u_int deficit;
 
 	vm_domain_free_assert_unlocked(vmd);
 
 	deficit = max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1);
 	atomic_add_int(&vmd->vmd_pageout_deficit, deficit);
 
 	/* Nothing more to do for requests that may not wait. */
 	if ((req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
 		return (0);
 
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 	vm_wait_domain(vmd->vmd_domain);
 	if (object != NULL)
 		VM_OBJECT_WLOCK(object);
 
 	return ((req & VM_ALLOC_WAITOK) != 0 ? EAGAIN : 0);
 }
 
 /*
  *	vm_waitpfault:
  *
  *	Sleep, for at most "timo", until free pages are available for
  *	allocation in the domains of "dset".
  *	- Called only in vm_fault so that processes page faulting
  *	  can be easily tracked.
  *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
  *	  processes will be able to grab memory first.  Do not change
  *	  this balance without careful testing first.
  */
 void
 vm_waitpfault(struct domainset *dset, int timo)
 {
 
 	mtx_lock(&vm_domainset_lock);
 	if (!vm_page_count_min_set(&dset->ds_mask)) {
 		mtx_unlock(&vm_domainset_lock);
 		return;
 	}
 
 	/*
 	 * XXX Ideally we would wait only until the allocation could be
 	 * satisfied.  As written, new allocators can consume all freed
 	 * pages while old allocators are still waiting.
 	 */
 	vm_min_waiters++;
 	msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
 	    "pfault", timo);
 }
 
 /*
  * Return the page queue with index "queue" in the domain that the page
  * belongs to.
  */
 static struct vm_pagequeue *
 _vm_page_pagequeue(vm_page_t m, uint8_t queue)
 {
 	struct vm_domain *vmd;
 
 	vmd = vm_pagequeue_domain(m);
 	return (&vmd->vmd_pagequeues[queue]);
 }
 
 #ifdef INVARIANTS
 /*
  * Return the page queue selected by the queue index currently stored in
  * the page's atomic state.  Only built for INVARIANTS kernels.
  */
 static struct vm_pagequeue *
 vm_page_pagequeue(vm_page_t m)
 {
 	vm_page_astate_t as;
 
 	as = vm_page_astate_load(m);
 	return (_vm_page_pagequeue(m, as.queue));
 }
 #endif
 
 /*
  * Try to replace the page's atomic state "*old" with "new".  The attempt
  * is repeated for as long as a failed vm_page_astate_fcmpset() leaves
  * "*old" equal to the value it had on entry; each failure is counted in
  * pqstate_commit_retries.  Returns true on success and false once "*old"
  * differs from its entry value.
  */
 static __always_inline bool
 vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
 {
 	vm_page_astate_t entry;
 
 	entry = *old;
 	for (;;) {
 		if (__predict_true(vm_page_astate_fcmpset(m, old, new)))
 			return (true);
 		counter_u64_add(pqstate_commit_retries, 1);
 		if (old->_bits != entry._bits)
 			return (false);
 	}
 }
 
 /*
  * Commit a queue state update that takes the page out of its current
  * queue.  The page queue lock must be held.  Returns true if the state
  * update was committed and false otherwise.
  */
 static bool
 _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m,
     vm_page_astate_t *old, vm_page_astate_t new)
 {
 	vm_page_t succ;
 
 	vm_pagequeue_assert_locked(pq);
 	KASSERT(vm_page_pagequeue(m) == pq,
 	    ("%s: queue %p does not match page %p", __func__, pq, m));
 	KASSERT(old->queue != PQ_NONE && new.queue != old->queue,
 	    ("%s: invalid queue indices %d %d",
 	    __func__, old->queue, new.queue));
 
 	/*
 	 * A page that is not physically enqueued only needs its queue
 	 * index updated.
 	 */
 	if ((old->flags & PGA_ENQUEUED) == 0)
 		return (vm_page_pqstate_fcmpset(m, old, new));
 
 	/*
 	 * After the queue index changes, nothing synchronizes with later
 	 * updates to the page's physical queue state.  So unlink the page
 	 * speculatively first, remembering its successor so that the
 	 * removal can be undone if the state update fails.
 	 */
 	new.flags &= ~PGA_ENQUEUED;
 	succ = TAILQ_NEXT(m, plinks.q);
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pq);
 	if (vm_page_pqstate_fcmpset(m, old, new))
 		return (true);
 
 	/* The update failed: put the page back where it was. */
 	if (succ != NULL)
 		TAILQ_INSERT_BEFORE(succ, m, plinks.q);
 	else
 		TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_inc(pq);
 	return (false);
 }
 
 /*
  * Locking wrapper around _vm_page_pqstate_commit_dequeue().  If the page's
  * state no longer matches "*old" once the queue lock is held, "*old" is
  * refreshed with the current state and false is returned.
  */
 static bool
 vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old,
     vm_page_astate_t new)
 {
 	struct vm_pagequeue *pq;
 	vm_page_astate_t cur;
 	bool committed;
 
 	pq = _vm_page_pagequeue(m, old->queue);
 
 	/*
 	 * The queue index and PGA_ENQUEUED are only stable while the
 	 * corresponding page queue lock is held, so reload under the lock.
 	 */
 	vm_pagequeue_lock(pq);
 	cur = vm_page_astate_load(m);
 	if (__predict_true(cur._bits == old->_bits)) {
 		committed = _vm_page_pqstate_commit_dequeue(pq, m, old, new);
 	} else {
 		*old = cur;
 		committed = false;
 	}
 	vm_pagequeue_unlock(pq);
 	return (committed);
 }
 
 /*
  * Commit a queue state update that enqueues the page or moves it within
  * the queue it is already on.  The page queue lock must be held.  Returns
  * false, without touching the queue, if the state update fails.
  */
 static bool
 _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m,
     vm_page_astate_t *old, vm_page_astate_t new)
 {
 	struct vm_domain *vmd;
 
 	vm_pagequeue_assert_locked(pq);
 	KASSERT(old->queue != PQ_NONE && new.queue == old->queue,
 	    ("%s: invalid queue indices %d %d",
 	    __func__, old->queue, new.queue));
 
 	new.flags |= PGA_ENQUEUED;
 	if (!vm_page_pqstate_fcmpset(m, old, new))
 		return (false);
 
 	/* Unlink a page that was already linked; otherwise count it. */
 	if ((old->flags & PGA_ENQUEUED) == 0)
 		vm_pagequeue_cnt_inc(pq);
 	else
 		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 
 	/*
 	 * PGA_REQUEUE_HEAD wins over PGA_REQUEUE.  In particular, when both
 	 * flags are set in close succession only PGA_REQUEUE_HEAD is
 	 * applied, even if it was set first.
 	 */
 	if ((old->flags & PGA_REQUEUE_HEAD) == 0) {
 		TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 		return (true);
 	}
 	vmd = vm_pagequeue_domain(m);
 	KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE],
 	    ("%s: invalid page queue for page %p", __func__, m));
 	TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 	return (true);
 }
 
 /*
  * Commit a queue state update that records a request for a deferred queue
  * operation, then submit the page to the batch queue for "new.queue".
  * Returns false if the state update fails.
  */
 static bool
 vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old,
     vm_page_astate_t new)
 {
 
 	KASSERT(old->queue == new.queue || new.queue != PQ_NONE,
 	    ("%s: invalid state, queue %d flags %x",
 	    __func__, new.queue, new.flags));
 
 	/* Only touch the page's state when it would actually change. */
 	if (old->_bits != new._bits) {
 		if (!vm_page_pqstate_fcmpset(m, old, new))
 			return (false);
 	}
 	vm_page_pqbatch_submit(m, new.queue);
 	return (true);
 }
 
 /*
  * General-purpose queue state update.  It covers more cases than the
  * specialized commit functions above.  Returns true if the page ends up
  * in state "new" and false if the update could not be committed.
  */
 bool
 vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
 {
 
 	/* Nothing changes. */
 	if (old->_bits == new._bits)
 		return (true);
 
 	/* The page leaves its current queue. */
 	if (old->queue != PQ_NONE && new.queue != old->queue) {
 		if (!vm_page_pqstate_commit_dequeue(m, old, new))
 			return (false);
 		if (new.queue != PQ_NONE)
 			vm_page_pqbatch_submit(m, new.queue);
 		return (true);
 	}
 
 	/*
 	 * The page stays put or was not queued; submit it to the batch
 	 * queue only if the update newly set a queue operation flag.
 	 */
 	if (!vm_page_pqstate_fcmpset(m, old, new))
 		return (false);
 	if (new.queue != PQ_NONE &&
 	    ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0)
 		vm_page_pqbatch_submit(m, new.queue);
 	return (true);
 }
 
 /*
  * Carry out whatever deferred queue operation is encoded in the page's
  * atomic state, retrying until the update commits or there is nothing
  * to do for this queue.
  */
 static inline void
 vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue)
 {
 	vm_page_astate_t new, old;
 	bool committed;
 
 	CRITICAL_ASSERT(curthread);
 	vm_pagequeue_assert_locked(pq);
 	KASSERT(queue < PQ_COUNT,
 	    ("%s: invalid queue index %d", __func__, queue));
 	KASSERT(pq == _vm_page_pagequeue(m, queue),
 	    ("%s: page %p does not belong to queue %p", __func__, m, pq));
 
 	old = vm_page_astate_load(m);
 	for (;;) {
 		/* Page moved to another queue, or no operation pending. */
 		if (__predict_false(old.queue != queue ||
 		    (old.flags & PGA_QUEUE_OP_MASK) == 0)) {
 			counter_u64_add(queue_nops, 1);
 			return;
 		}
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("%s: page %p is unmanaged", __func__, m));
 
 		new = old;
 		if ((old.flags & PGA_DEQUEUE) == 0) {
 			new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD);
 			committed = _vm_page_pqstate_commit_requeue(pq, m,
 			    &old, new);
 		} else {
 			new.flags &= ~PGA_QUEUE_OP_MASK;
 			new.queue = PQ_NONE;
 			committed = _vm_page_pqstate_commit_dequeue(pq, m,
 			    &old, new);
 		}
 		if (__predict_true(committed)) {
 			counter_u64_add(queue_ops, 1);
 			return;
 		}
 	}
 }
 
 /*
  * Process every page recorded in the batch queue against "pq", then
  * reinitialize the batch queue.
  */
 static void
 vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
     uint8_t queue)
 {
 	int slot;
 
 	slot = 0;
 	while (slot < bq->bq_cnt) {
 		vm_pqbatch_process_page(pq, bq->bq_pa[slot], queue);
 		slot++;
 	}
 	vm_batchqueue_init(bq);
 }
 
 /*
  *	vm_page_pqbatch_submit:		[ internal use only ]
  *
  *	Add a page to the batched work queue of the given page queue.  The
  *	requested operation must already be encoded in the a.flags field
  *	of the page structure.  If the page cannot be inserted into the
  *	batch queue, the batch and then the page itself are processed
  *	immediately under the page queue lock.
  */
 void
 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 {
 	struct vm_batchqueue *bq;
 	struct vm_pagequeue *pq;
 	int domain;
 	bool queued;
 
 	KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
 
 	domain = vm_page_domain(m);
 
 	/* Fast path: the page fits into the batch queue. */
 	critical_enter();
 	bq = DPCPU_PTR(pqbatch[domain][queue]);
 	queued = vm_batchqueue_insert(bq, m);
 	critical_exit();
 	if (queued)
 		return;
 
 	/*
 	 * Slow path: take the page queue lock, look the batch queue up
 	 * again inside a new critical section, and process it followed by
 	 * this page.
 	 */
 	pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	critical_enter();
 	bq = DPCPU_PTR(pqbatch[domain][queue]);
 	vm_pqbatch_process(pq, bq, queue);
 	vm_pqbatch_process_page(pq, m, queue);
 	vm_pagequeue_unlock(pq);
 	critical_exit();
 }
 
 /*
  *	vm_page_pqbatch_drain:		[ internal use only ]
  *
  *	Force all per-CPU page queue batch queues to be drained.  This is
  *	intended for use in severe memory shortages, to ensure that pages
  *	do not remain stuck in the batch queues.
  *
  *	The calling thread is bound to each CPU in turn and is unbound
  *	again before returning.
  */
 void
 vm_page_pqbatch_drain(void)
 {
 	struct thread *td;
 	struct vm_domain *vmd;
 	struct vm_pagequeue *pq;
 	int cpu, domain, queue;
 
 	td = curthread;
 	CPU_FOREACH(cpu) {
 		/*
 		 * Bind to "cpu" -- presumably so that DPCPU_PTR() below
 		 * resolves to that CPU's batch queues.
 		 */
 		thread_lock(td);
 		sched_bind(td, cpu);
 		thread_unlock(td);
 
 		/* Process the batch queue of every (domain, queue) pair. */
 		for (domain = 0; domain < vm_ndomains; domain++) {
 			vmd = VM_DOMAIN(domain);
 			for (queue = 0; queue < PQ_COUNT; queue++) {
 				pq = &vmd->vmd_pagequeues[queue];
 				vm_pagequeue_lock(pq);
 				critical_enter();
 				vm_pqbatch_process(pq,
 				    DPCPU_PTR(pqbatch[domain][queue]), queue);
 				critical_exit();
 				vm_pagequeue_unlock(pq);
 			}
 		}
 	}
 	/* Undo the binding established inside the loop. */
 	thread_lock(td);
 	sched_unbind(td);
 	thread_unlock(td);
 }
 
 /*
  *	vm_page_dequeue_deferred:	[ internal use only ]
  *
  *	Ask for the given page to be removed from the page queue it is
  *	currently in by setting PGA_DEQUEUE.  The physical removal from
  *	the queue may be deferred indefinitely.  Does nothing for a page
  *	that is not in any queue.
  */
 void
 vm_page_dequeue_deferred(vm_page_t m)
 {
 	vm_page_astate_t new, old;
 
 	old = vm_page_astate_load(m);
 	for (;;) {
 		if (old.queue == PQ_NONE) {
 			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
 			    ("%s: page %p has unexpected queue state",
 			    __func__, m));
 			return;
 		}
 		new = old;
 		new.flags |= PGA_DEQUEUE;
 		if (vm_page_pqstate_commit_request(m, &old, new))
 			return;
 	}
 }
 
 /*
  *	vm_page_dequeue:
  *
  *	Take the page out of whichever page queue it is in, if any, before
  *	returning.  Pending queue operation flags are cleared as part of
  *	the same state update.
  */
 void
 vm_page_dequeue(vm_page_t m)
 {
 	vm_page_astate_t new, old;
 
 	old = vm_page_astate_load(m);
 	for (;;) {
 		if (old.queue == PQ_NONE) {
 			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
 			    ("%s: page %p has unexpected queue state",
 			    __func__, m));
 			return;
 		}
 		new = old;
 		new.flags &= ~PGA_QUEUE_OP_MASK;
 		new.queue = PQ_NONE;
 		if (vm_page_pqstate_commit_dequeue(m, &old, new))
 			return;
 	}
 }
 
 /*
  * Schedule the given page for insertion into the specified page queue.
  * Physical insertion of the page may be deferred indefinitely.
  *
  * The page must not currently be in a queue or carry any queue state
  * flags, and it must hold at least one reference.
  */
 static void
 vm_page_enqueue(vm_page_t m, uint8_t queue)
 {
 
 	KASSERT(m->a.queue == PQ_NONE &&
 	    (m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
 	    ("%s: page %p is already enqueued", __func__, m));
 	KASSERT(m->ref_count > 0,
 	    ("%s: page %p does not carry any references", __func__, m));
 
 	/*
 	 * Record the target queue with a plain store, then request the
 	 * insertion via PGA_REQUEUE before handing the page to the batch
 	 * queue.
 	 */
 	m->a.queue = queue;
 	if ((m->a.flags & PGA_REQUEUE) == 0)
 		vm_page_aflag_set(m, PGA_REQUEUE);
 	vm_page_pqbatch_submit(m, queue);
 }
 
 /*
  *	vm_page_free_prep:
  *
  *	Prepares the given page to be put on the free list,
  *	disassociating it from any VM object. The caller may return
  *	the page to the free list only if this function returns true.
  *
  *	The object, if it exists, must be locked, and then the page must
  *	be xbusy.  Otherwise the page must be not busied.  A managed
  *	page must be unmapped.
  *
  *	Returns false for fictitious pages and for pages for which
  *	vm_reserv_free_page() returns true.  Panics if the page still
  *	has references after its object reference has been dropped.
  */
 static bool
 vm_page_free_prep(vm_page_t m)
 {
 
 	/*
 	 * Synchronize with threads that have dropped a reference to this
 	 * page.
 	 */
 	atomic_thread_fence_acq();
 
 #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
 	/* Verify through the direct map that a PG_ZERO page really is zero. */
 	if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) {
 		uint64_t *p;
 		int i;
 		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
 			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
 			    m, i, (uintmax_t)*p));
 	}
 #endif
 	/* Sanity checks that differ for managed and unmanaged pages. */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("vm_page_free_prep: freeing mapped page %p", m));
 		KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
 		    ("vm_page_free_prep: mapping flags set in page %p", m));
 	} else {
 		KASSERT(m->a.queue == PQ_NONE,
 		    ("vm_page_free_prep: unmanaged page %p is queued", m));
 	}
 	VM_CNT_INC(v_tfree);
 
 	/* Detach the page from its object, if it has one. */
 	if (m->object != NULL) {
 		KASSERT(((m->oflags & VPO_UNMANAGED) != 0) ==
 		    ((m->object->flags & OBJ_UNMANAGED) != 0),
 		    ("vm_page_free_prep: managed flag mismatch for page %p",
 		    m));
 		vm_page_assert_xbusied(m);
 
 		/*
 		 * The object reference can be released without an atomic
 		 * operation.
 		 */
 		KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 		    m->ref_count == VPRC_OBJREF,
 		    ("vm_page_free_prep: page %p has unexpected ref_count %u",
 		    m, m->ref_count));
 		vm_page_object_remove(m);
 		m->ref_count -= VPRC_OBJREF;
 	} else
 		vm_page_assert_unbusied(m);
 
 	vm_page_busy_free(m);
 
 	/*
 	 * If fictitious remove object association and
 	 * return.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->ref_count == 1,
 		    ("fictitious page %p is referenced", m));
 		KASSERT(m->a.queue == PQ_NONE,
 		    ("fictitious page %p is queued", m));
 		return (false);
 	}
 
 	/*
 	 * Pages need not be dequeued before they are returned to the physical
 	 * memory allocator, but they must at least be marked for a deferred
 	 * dequeue.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_dequeue_deferred(m);
 
 	/* Discard the page's valid and dirty state. */
 	m->valid = 0;
 	vm_page_undirty(m);
 
 	if (m->ref_count != 0)
 		panic("vm_page_free_prep: page %p has references", m);
 
 	/*
 	 * Restore the default memory attribute to the page.
 	 */
 	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Determine whether the page belongs to a reservation.  If the page was
 	 * allocated from a per-CPU cache, it cannot belong to a reservation, so
 	 * as an optimization, we avoid the check in that case.
 	 *
 	 * When vm_reserv_free_page() returns true, the caller is told not to
 	 * put the page on the free list itself.
 	 */
 	if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
 		return (false);
 #endif
 
 	return (true);
 }
 
 /*
  *	vm_page_free_toq:
  *
  *	Hand the given page back to the free pool.  vm_page_free_prep() is
  *	run first and performs the disassociation from any VM object; if it
  *	returns false this routine does nothing further.
  *
  *	The object must be locked.  The page must be exclusively busied if it
  *	belongs to an object.
  */
 static void
 vm_page_free_toq(vm_page_t m)
 {
 	struct vm_domain *domain;
 	uma_zone_t cache;
 
 	if (!vm_page_free_prep(m))
 		return;
 
 	domain = vm_pagequeue_domain(m);
 	cache = domain->vmd_pgcache[m->pool].zone;
 	if (cache == NULL || (m->flags & PG_PCPU_CACHE) == 0) {
 		/*
 		 * No usable cache zone for this page: return it to the
 		 * physical allocator under the domain free lock and then
 		 * bump the domain's free count.
 		 */
 		vm_domain_free_lock(domain);
 		vm_phys_free_pages(m, 0);
 		vm_domain_free_unlock(domain);
 		vm_domain_freecnt_inc(domain, 1);
 	} else {
 		/* The page is flagged PG_PCPU_CACHE; give it to the zone. */
 		uma_zfree(cache, m);
 	}
 }
 
 /*
  *	vm_page_free_pages_toq:
  *
  *	Drain a list of pages, returning each one to the free list and
  *	disassociating it from any VM object.  In other words, this is
  *	equivalent to calling vm_page_free_toq() on every page of the list.
  *
  *	If update_wire_count is true, the number of pages taken off the
  *	list is passed to vm_wire_sub().  An empty list is a no-op.
  */
 void
 vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
 {
 	vm_page_t pg;
 	int nfreed;
 
 	if (SLIST_EMPTY(free))
 		return;
 
 	/* Pop pages off the head until the list is exhausted. */
 	for (nfreed = 0; (pg = SLIST_FIRST(free)) != NULL; nfreed++) {
 		SLIST_REMOVE_HEAD(free, plinks.s.ss);
 		vm_page_free_toq(pg);
 	}
 
 	if (update_wire_count)
 		vm_wire_sub(nfreed);
 }
 
 /*
  * Add a wiring to the page.  For managed pages, this prevents reclamation
  * by the page daemon, or when the containing object, if any, is destroyed.
  *
  * The wiring is taken with an atomic increment of the page's reference
  * count.  On the zero-to-one transition of the wire count the global wired
  * count is bumped and, for managed pages, PGA_DEQUEUE is set.
  */
 void
 vm_page_wire(vm_page_t m)
 {
 	u_int prev;
 
 #ifdef INVARIANTS
 	if (m->object != NULL && !vm_page_busied(m) &&
 	    !vm_object_busied(m->object))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 #endif
 	KASSERT((m->flags & PG_FICTITIOUS) == 0 ||
 	    VPRC_WIRE_COUNT(m->ref_count) >= 1,
 	    ("vm_page_wire: fictitious page %p has zero wirings", m));
 
 	prev = atomic_fetchadd_int(&m->ref_count, 1);
 	KASSERT(VPRC_WIRE_COUNT(prev) != VPRC_WIRE_COUNT_MAX,
 	    ("vm_page_wire: counter overflow for page %p", m));
 
 	/* Nothing more to do unless this was the first wiring. */
 	if (VPRC_WIRE_COUNT(prev) != 0)
 		return;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_aflag_set(m, PGA_DEQUEUE);
 	vm_wire_add(1);
 }
 
 /*
  * Try to wire a mapped page that was found through a pmap lookup.
  * The attempt fails (returning false) when VPRC_BLOCKED is set in the
  * page's reference count, which happens while a thread is concurrently
  * tearing down mappings of the page.  The transient failure is acceptable
  * because it translates to the failure of the caller
  * pmap_extract_and_hold(), which should then be followed by the vm_fault()
  * fallback, see e.g. vm_fault_quick_hold_pages().
  */
 bool
 vm_page_wire_mapped(vm_page_t m)
 {
 	u_int cur;
 
 	cur = m->ref_count;
 	for (;;) {
 		KASSERT(cur > 0,
 		    ("vm_page_wire_mapped: wiring unreferenced page %p", m));
 		if ((cur & VPRC_BLOCKED) != 0)
 			return (false);
 		/* On failure, cur is refreshed and the checks are redone. */
 		if (atomic_fcmpset_int(&m->ref_count, &cur, cur + 1))
 			break;
 	}
 
 	/* cur is the reference count as it was before our increment. */
 	if (VPRC_WIRE_COUNT(cur) == 0) {
 		if ((m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_DEQUEUE);
 		vm_wire_add(1);
 	}
 	return (true);
 }
 
 /*
  * Release a wiring reference to a managed page.  If the page still belongs to
  * an object, update its position in the page queues to reflect the reference.
  * If the wiring was the last reference to the page, free the page.
  *
  * "nqueue" and "noreuse" are only consulted when the last wiring of a page
  * that still holds an object reference is dropped; they are forwarded
  * unchanged to vm_page_release_toq().
  */
 static void
 vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
 {
 	u_int old;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("%s: page %p is unmanaged", __func__, m));
 
 	/*
 	 * Update LRU state before releasing the wiring reference.
 	 * Use a release store when updating the reference count to
 	 * synchronize with vm_page_free_prep().
 	 */
 	old = m->ref_count;
 	do {
 		KASSERT(VPRC_WIRE_COUNT(old) > 0,
 		    ("vm_page_unwire: wire count underflow for page %p", m));
 
 		if (old > VPRC_OBJREF + 1) {
 			/*
 			 * The page has at least one other wiring reference.  An
 			 * earlier iteration of this loop may have called
 			 * vm_page_release_toq() and cleared PGA_DEQUEUE, so
 			 * re-set it if necessary.
 			 */
 			if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
 				vm_page_aflag_set(m, PGA_DEQUEUE);
 		} else if (old == VPRC_OBJREF + 1) {
 			/*
 			 * This is the last wiring.  Clear PGA_DEQUEUE and
 			 * update the page's queue state to reflect the
 			 * reference.  If the page does not belong to an object
 			 * (i.e., the VPRC_OBJREF bit is clear), we only need to
 			 * clear leftover queue state.
 			 */
 			vm_page_release_toq(m, nqueue, noreuse);
 		} else if (old == 1) {
 			/* Last wiring and no object reference remains. */
 			vm_page_aflag_clear(m, PGA_DEQUEUE);
 		}
 		/* A failed compare-and-set reloads "old" and redoes the above. */
 	} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
 
 	/*
 	 * "old" is the reference count observed just before the successful
 	 * decrement.  If the wire count dropped to zero, adjust the global
 	 * wired count, and free the page if that was its only reference.
 	 */
 	if (VPRC_WIRE_COUNT(old) == 1) {
 		vm_wire_sub(1);
 		if (old == 1)
 			vm_page_free(m);
 	}
 }
 
 /*
  * Drop one wiring of the specified page, potentially allowing it to be
  * paged out.
  *
  * Only managed pages belonging to an object can be paged out.  If the number
  * of wirings transitions to zero and the page is eligible for page out, then
  * the page is added to the specified paging queue.  If the released wiring
  * represented the last reference to the page, the page is freed.
  *
  * Unmanaged pages never touch the page queues: they are unwired with
  * vm_page_unwire_noq() and freed once their reference count is zero.
  */
 void
 vm_page_unwire(vm_page_t m, uint8_t nqueue)
 {
 	KASSERT(nqueue < PQ_COUNT,
 	    ("vm_page_unwire: invalid queue %u request for page %p",
 	    nqueue, m));
 
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		vm_page_unwire_managed(m, nqueue, false);
 	} else if (vm_page_unwire_noq(m) && m->ref_count == 0) {
 		vm_page_free(m);
 	}
 }
 
 /*
  * Drop a wiring without (re-)inserting the page into a page queue.  It is
  * up to the caller to enqueue, requeue, or free the page as appropriate.
  * In most cases involving managed pages, vm_page_unwire() should be used
  * instead.
  *
  * Returns true if the wiring that was dropped was the page's last one,
  * and false if other wirings remain.
  */
 bool
 vm_page_unwire_noq(vm_page_t m)
 {
 	u_int prev;
 
 	prev = vm_page_drop(m, 1);
 	KASSERT(VPRC_WIRE_COUNT(prev) != 0,
 	    ("%s: counter underflow for page %p", __func__,  m));
 	KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(prev) > 1,
 	    ("%s: missing ref on fictitious page %p", __func__, m));
 
 	if (VPRC_WIRE_COUNT(prev) <= 1) {
 		/* Last wiring: undo what the first wiring set up. */
 		if ((m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_clear(m, PGA_DEQUEUE);
 		vm_wire_sub(1);
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Ensure that the page ends up in the specified page queue.  If the page is
  * active or being moved to the active queue, ensure that its act_count is
  * at least ACT_INIT but do not otherwise mess with it.
  *
  * "nflag" must be PGA_REQUEUE or PGA_REQUEUE_HEAD (asserted below).
  * Unmanaged and wired pages are left untouched, as is any page that
  * already has PGA_DEQUEUE set.
  */
 static __always_inline void
 vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
 {
 	vm_page_astate_t old, new;
 
 	KASSERT(m->ref_count > 0,
 	    ("%s: page %p does not carry any references", __func__, m));
 	KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
 	    ("%s: invalid flags %x", __func__, nflag));
 
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
 		return;
 
 	old = vm_page_astate_load(m);
 	do {
 		/* A pending dequeue takes precedence; leave the state alone. */
 		if ((old.flags & PGA_DEQUEUE) != 0)
 			break;
 		new = old;
 		/* Discard any previously requested queue operation. */
 		new.flags &= ~PGA_QUEUE_OP_MASK;
 		if (nqueue == PQ_ACTIVE)
 			new.act_count = max(old.act_count, ACT_INIT);
 		if (old.queue == nqueue) {
 			/*
 			 * Already in the target queue: request a requeue,
 			 * except for the active queue where none is asked for.
 			 */
 			if (nqueue != PQ_ACTIVE)
 				new.flags |= nflag;
 		} else {
 			new.flags |= nflag;
 			new.queue = nqueue;
 		}
 		/* Retry with the reloaded state if the commit fails. */
 	} while (!vm_page_pqstate_commit(m, &old, new));
 }
 
 /*
  * Request that the page be placed in the active queue.  This is a thin
  * wrapper around vm_page_mvqueue(), which ignores the request when it is
  * not appropriate (e.g., for unmanaged or wired pages).
  */
 void
 vm_page_activate(vm_page_t m)
 {
 	vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
 }
 
 /*
  * Request that the page be moved to the tail of the inactive queue, or
  * requeued there if it is already in the inactive queue.  Implemented via
  * vm_page_mvqueue() with PGA_REQUEUE.
  */
 void
 vm_page_deactivate(vm_page_t m)
 {
 	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
 }
 
 /*
  * Like vm_page_deactivate(), but the request is made with PGA_REQUEUE_HEAD
  * rather than PGA_REQUEUE, i.e., the page is to be placed near the head of
  * the inactive queue.
  */
 void
 vm_page_deactivate_noreuse(vm_page_t m)
 {
 	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
 }
 
 /*
  * Request that the page be placed in the laundry queue, or requeued there
  * if it is already in the laundry.  Implemented via vm_page_mvqueue().
  */
 void
 vm_page_launder(vm_page_t m)
 {
 	vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
 }
 
 /*
  * Move a page to the PQ_UNSWAPPABLE holding queue.  The page must be
  * managed and must not be wired; it is dequeued from wherever it currently
  * is before being enqueued again.
  */
 void
 vm_page_unswappable(vm_page_t m)
 {
 	KASSERT(!(vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0),
 	    ("page %p already unswappable", m));
 
 	vm_page_dequeue(m);
 	vm_page_enqueue(m, PQ_UNSWAPPABLE);
 }
 
 /*
  * Put a page back on the page queues in preparation for unwiring.
  *
  * "nqueue" is the queue the caller would like the page to land in; it is
  * overridden with PQ_INACTIVE when reclamation is being accelerated.
  */
 static void
 vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
 {
 	vm_page_astate_t cur, next;
 	uint16_t reqflag;
 
 	/*
 	 * The valid bits tell us whether reclamation of the page should be
 	 * accelerated.  The object lock might not be held here, in which
 	 * case the check is racy.  At worst we will either accelerate
 	 * reclamation of a valid page and violate LRU, or unnecessarily
 	 * defer reclamation of an invalid page.
 	 *
 	 * A caller asking us not to cache the page gets the same treatment:
 	 * the page goes near the head of the inactive queue so that it is
 	 * reclaimed sooner.
 	 */
 	reqflag = PGA_REQUEUE;
 	if (noreuse || m->valid == 0) {
 		reqflag = PGA_REQUEUE_HEAD;
 		nqueue = PQ_INACTIVE;
 	}
 
 	cur = vm_page_astate_load(m);
 	do {
 		next = cur;
 		next.flags &= ~PGA_QUEUE_OP_MASK;
 
 		/*
 		 * A page that already sits in the active queue, and whose
 		 * reclamation we are not trying to accelerate, is merely
 		 * marked referenced; no queue operation is requested.
 		 */
 		if (reqflag != PGA_REQUEUE_HEAD && cur.queue == PQ_ACTIVE) {
 			next.flags |= PGA_REFERENCED;
 		} else {
 			next.queue = nqueue;
 			next.flags |= reqflag;
 		}
 	} while (!vm_page_pqstate_commit(m, &cur, next));
 }
 
 /*
  * Unwire a page and either attempt to free it or re-add it to the page
  * queues.
  *
  * With VPR_TRYFREE, an opportunistic attempt is made to write-lock the
  * page's object and hand the page to vm_page_release_locked().  If that
  * is not possible (no object, page busied, or the try-lock fails), the
  * page is unwired via vm_page_unwire_managed() targeting the inactive
  * queue.
  */
 void
 vm_page_release(vm_page_t m, int flags)
 {
 	vm_object_t obj;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("vm_page_release: page %p is unmanaged", m));
 
 	if ((flags & VPR_TRYFREE) != 0) {
 		while ((obj = atomic_load_ptr(&m->object)) != NULL) {
 			/* Depends on type-stability. */
 			if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(obj))
 				break;
 			if (obj == m->object) {
 				vm_page_release_locked(m, flags);
 				VM_OBJECT_WUNLOCK(obj);
 				return;
 			}
 			/* The page changed identity under us; look again. */
 			VM_OBJECT_WUNLOCK(obj);
 		}
 	}
 	vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0);
 }
 
 /*
  * Variant of vm_page_release() for callers that already hold the write
  * lock of the page's object.  After the last wiring is dropped, the page
  * is freed if VPR_TRYFREE was requested and the page is clean, can be
  * exclusively busied, and is either unmapped or belongs to an object with
  * no references; otherwise it is returned to the inactive queue.
  */
 void
 vm_page_release_locked(vm_page_t m, int flags)
 {
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("vm_page_release_locked: page %p is unmanaged", m));
 
 	/* Other wirings remain; there is nothing else to do. */
 	if (!vm_page_unwire_noq(m))
 		return;
 
 	if ((flags & VPR_TRYFREE) == 0 ||
 	    (m->object->ref_count != 0 && pmap_page_is_mapped(m)) ||
 	    m->dirty != 0 || !vm_page_tryxbusy(m)) {
 		vm_page_release_toq(m, PQ_INACTIVE, flags != 0);
 		return;
 	}
 
 	/*
 	 * The page is now exclusively busied.  An unlocked lookup may have
 	 * wired the page before the busy lock was acquired, in which case
 	 * the page must not be freed.
 	 */
 	if (__predict_true(!vm_page_wired(m))) {
 		vm_page_free(m);
 		return;
 	}
 	vm_page_xunbusy(m);
 }
 
 /*
  * Run "op" on the page while VPRC_BLOCKED is set in its reference count.
  *
  * The flag is only set if the page has no wirings; if a wiring is observed
  * the function returns false without calling "op".  While the flag is set,
  * vm_page_wire_mapped() refuses to add a wiring.  Returns true once "op"
  * has run and the flag has been dropped again.
  *
  * The page must be managed, busied, and belong to a locked object.
  */
 static bool
 vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t))
 {
 	u_int old;
 
 	KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0,
 	    ("vm_page_try_blocked_op: page %p has no object", m));
 	KASSERT(vm_page_busied(m),
 	    ("vm_page_try_blocked_op: page %p is not busy", m));
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 
 	/* Atomically set VPRC_BLOCKED, bailing out if a wiring shows up. */
 	old = m->ref_count;
 	do {
 		KASSERT(old != 0,
 		    ("vm_page_try_blocked_op: page %p has no references", m));
 		if (VPRC_WIRE_COUNT(old) != 0)
 			return (false);
 	} while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED));
 
 	(op)(m);
 
 	/*
 	 * If the object is read-locked, new wirings may be created via an
 	 * object lookup.
 	 */
 	old = vm_page_drop(m, VPRC_BLOCKED);
 	KASSERT(!VM_OBJECT_WOWNED(m->object) ||
 	    old == (VPRC_BLOCKED | VPRC_OBJREF),
 	    ("vm_page_try_blocked_op: unexpected refcount value %u for %p",
 	    old, m));
 	return (true);
 }
 
 /*
  * Atomically check for wirings and, if there are none, remove all mappings
  * of the page with pmap_remove_all().  Returns false, leaving the mappings
  * in place, if the page is wired.
  */
 bool
 vm_page_try_remove_all(vm_page_t m)
 {
 	return (vm_page_try_blocked_op(m, pmap_remove_all));
 }
 
 /*
  * Atomically check for wirings and, if there are none, remove all
  * writeable mappings of the page with pmap_remove_write().  Returns false,
  * leaving the mappings in place, if the page is wired.
  */
 bool
 vm_page_try_remove_write(vm_page_t m)
 {
 	return (vm_page_try_blocked_op(m, pmap_remove_write));
 }
 
 /*
  * vm_page_advise
  *
  * 	Apply the specified madvise-style advice to the given page.  The
  * 	page's object must be write-locked and the page exclusively busied.
  * 	Advice other than MADV_FREE, MADV_DONTNEED and MADV_WILLNEED is
  * 	ignored.
  */
 void
 vm_page_advise(vm_page_t m, int advice)
 {
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	vm_page_assert_xbusied(m);
 
 	switch (advice) {
 	case MADV_FREE:
 		/*
 		 * Mark the page clean.  This will allow the page to be freed
 		 * without first paging it out.  MADV_FREE pages are often
 		 * quickly reused by malloc(3), so we do not do anything that
 		 * would result in a page fault on a later access.
 		 */
 		vm_page_undirty(m);
 		break;
 	case MADV_DONTNEED:
 		/* Pick up a modification recorded only by the pmap. */
 		if (m->dirty == 0 && pmap_is_modified(m))
 			vm_page_dirty(m);
 		break;
 	case MADV_WILLNEED:
 		vm_page_activate(m);
 		return;
 	default:
 		return;
 	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
 	 * immediately reactivate the page.
 	 */
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 
 	/*
 	 * Place clean pages near the head of the inactive queue rather than
 	 * the tail, thus defeating the queue's LRU operation and ensuring that
 	 * the page will be reused quickly.  Dirty pages not already in the
 	 * laundry are moved there.
 	 */
 	if (m->dirty == 0)
 		vm_page_deactivate_noreuse(m);
 	else if (!vm_page_in_laundry(m))
 		vm_page_launder(m);
 }
 
 /*
  *	vm_page_grab_release
  *
  *	Helper for the grab functions: drop the busy state on return when
  *	the caller passed VM_ALLOC_NOBUSY.  The shared-busy release is used
  *	if VM_ALLOC_IGN_SBUSY is also set, the exclusive-busy release
  *	otherwise.  Without VM_ALLOC_NOBUSY this is a no-op.
  */
 static inline void
 vm_page_grab_release(vm_page_t m, int allocflags)
 {
 	if ((allocflags & VM_ALLOC_NOBUSY) == 0)
 		return;
 
 	if ((allocflags & VM_ALLOC_IGN_SBUSY) == 0)
 		vm_page_xunbusy(m);
 	else
 		vm_page_sunbusy(m);
 }
 
 /*
  *	vm_page_grab_sleep
  *
  *	Sleep on a busy page according to the VM_ALLOC_ parameters.  Returns
  *	true if the caller should retry and false otherwise: false is
  *	returned immediately, without sleeping, for VM_ALLOC_NOWAIT, and
  *	after the sleep for VM_ALLOC_WAITFAIL.
  *
  *	"locked" says whether the object is write-locked on entry.  When it
  *	is and _vm_page_busy_sleep() returns true, the object lock is taken
  *	again before returning (presumably because the sleep dropped it --
  *	confirm against _vm_page_busy_sleep()).
  */
 static bool
 vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex,
     const char *wmesg, int allocflags, bool locked)
 {
 	bool relock;
 
 	if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 		return (false);
 
 	/*
 	 * Reference the page before unlocking and sleeping so that
 	 * the page daemon is less likely to reclaim it.
 	 */
 	if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0)
 		vm_page_reference(m);
 
 	relock = _vm_page_busy_sleep(object, m, pindex, wmesg, allocflags,
 	    locked) && locked;
 	if (relock)
 		VM_OBJECT_WLOCK(object);
 
 	return ((allocflags & VM_ALLOC_WAITFAIL) == 0);
 }
 
 /*
  * Sanity-check the flags handed to the grab routines:
  *  - VM_ALLOC_NOBUSY is only acceptable together with VM_ALLOC_WIRED;
  *  - VM_ALLOC_SBUSY is only acceptable together with VM_ALLOC_IGN_SBUSY.
  */
 static inline void
 vm_page_grab_check(int allocflags)
 {
 	KASSERT(!((allocflags & VM_ALLOC_NOBUSY) != 0 &&
 	    (allocflags & VM_ALLOC_WIRED) == 0),
 	    ("vm_page_grab*: the pages must be busied or wired"));
 
 	KASSERT(!((allocflags & VM_ALLOC_SBUSY) != 0 &&
 	    (allocflags & VM_ALLOC_IGN_SBUSY) == 0),
 	    ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 }
 
 /*
  * Derive the flags to pass to the page allocator from the flags given to
  * a grab routine.  The wait-related flags and VM_ALLOC_NOBUSY are
  * stripped; VM_ALLOC_WAITFAIL is then added unless the caller asked for
  * VM_ALLOC_NOWAIT, and VM_ALLOC_SBUSY is added when VM_ALLOC_IGN_SBUSY was
  * given.
  */
 static inline int
 vm_page_grab_pflags(int allocflags)
 {
 	const int stripped = VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
 	    VM_ALLOC_WAITFAIL | VM_ALLOC_NOBUSY;
 	int result;
 
 	result = allocflags & ~stripped;
 	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
 		result |= VM_ALLOC_WAITFAIL;
 	if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
 		result |= VM_ALLOC_SBUSY;
 
 	return (result);
 }
 
 /*
  * Grab a page, waiting until we are woken up due to the page
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, first allocate it
  * and then conditionally zero it.
  *
  * Returns NULL if the page does not exist and VM_ALLOC_NOCREAT was given,
  * or if sleeping or allocating failed and the flags forbid a retry.
  *
  * This routine may sleep.
  *
  * The object must be locked on entry.  The lock will, however, be released
  * and reacquired if the routine sleeps.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_page_grab_check(allocflags);
 
 	for (;;) {
 		m = vm_page_lookup(object, pindex);
 		if (m != NULL) {
 			/* The page exists: acquire it or wait and retry. */
 			if (vm_page_tryacquire(m, allocflags))
 				break;
 			if (!vm_page_grab_sleep(object, m, pindex, "pgrbwt",
 			    allocflags, true))
 				return (NULL);
 			continue;
 		}
 
 		if ((allocflags & VM_ALLOC_NOCREAT) != 0)
 			return (NULL);
 
 		m = vm_page_alloc(object, pindex,
 		    vm_page_grab_pflags(allocflags));
 		if (m != NULL) {
 			if ((allocflags & VM_ALLOC_ZERO) != 0 &&
 			    (m->flags & PG_ZERO) == 0)
 				pmap_zero_page(m);
 			break;
 		}
 
 		/* Allocation failed; look the page up again if permitted. */
 		if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
 			return (NULL);
 	}
 
 	vm_page_grab_release(m, allocflags);
 
 	return (m);
 }
 
 /*
  * Locklessly attempt to acquire a page given a (object, pindex) tuple
  * and an optional previous page to avoid the radix lookup.  The resulting
  * page will be validated against the identity tuple and busied or wired
  * as requested.  A NULL *mp return guarantees that the page was not in
  * radix at the time of the call but callers must perform higher level
  * synchronization or retry the operation under a lock if they require
  * an atomic answer.  This is the only lock free validation routine,
  * other routines can depend on the resulting page state.
  *
  * The return value indicates whether the operation failed due to caller
  * flags.  The return is tri-state with mp:
  *
  * (true, *mp != NULL) - The operation was successful.
  * (true, *mp == NULL) - The page was not found in tree.
  * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition.
  */
 static bool
 vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex,
     vm_page_t prev, vm_page_t *mp, int allocflags)
 {
 	vm_page_t m;
 
 	vm_page_grab_check(allocflags);
 	MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev));
 
 	*mp = NULL;
 	for (;;) {
 		/*
 		 * We may see a false NULL here because the previous page
 		 * has been removed or just inserted and the list is loaded
 		 * without barriers.  Switch to radix to verify.
 		 */
 		if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL ||
 		    QMD_IS_TRASHED(m) || m->pindex != pindex ||
 		    atomic_load_ptr(&m->object) != object) {
 			/* The list shortcut is not retried on later passes. */
 			prev = NULL;
 			/*
 			 * This guarantees the result is instantaneously
 			 * correct.
 			 */
 			m = vm_radix_lookup_unlocked(&object->rtree, pindex);
 		}
 		if (m == NULL)
 			return (true);
 		if (vm_page_trybusy(m, allocflags)) {
 			/*
 			 * Now that the page is busied, recheck its identity;
 			 * it was found without holding any lock.
 			 */
 			if (m->object == object && m->pindex == pindex)
 				break;
 			/* relookup. */
 			vm_page_busy_release(m);
 			cpu_spinwait();
 			continue;
 		}
 		/* The page is busy: sleep, then retry unless told not to. */
 		if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
 		    allocflags, false))
 			return (false);
 	}
 	if ((allocflags & VM_ALLOC_WIRED) != 0)
 		vm_page_wire(m);
 	vm_page_grab_release(m, allocflags);
 	*mp = m;
 	return (true);
 }
 
 /*
  * Try to grab a page without the object lock, falling back to
  * vm_page_grab() under the object write lock when the page is absent and
  * VM_ALLOC_NOCREAT is not set.  Returns NULL if the lockless acquisition
  * was refused because of the caller's flags, or if the page is absent and
  * VM_ALLOC_NOCREAT is set.
  */
 vm_page_t
 vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 
 	vm_page_grab_check(allocflags);
 
 	if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags))
 		return (NULL);
 
 	/*
 	 * The lockless radix lookup should never return a false negative.
 	 * A caller specifying NOCREAT is therefore guaranteed that no page
 	 * was present at the instant of the call, but such a caller must
 	 * handle create races gracefully.
 	 */
 	if (m == NULL && (allocflags & VM_ALLOC_NOCREAT) == 0) {
 		VM_OBJECT_WLOCK(object);
 		m = vm_page_grab(object, pindex, allocflags);
 		VM_OBJECT_WUNLOCK(object);
 	}
 
 	return (m);
 }
 
 /*
  * Grab a page and make it valid, paging in if necessary.  Pages missing from
  * their pager are zero filled and validated.  If a VM_ALLOC_COUNT is supplied
  * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought
  * in simultaneously.  Additional pages will be left on a paging queue but
  * will neither be wired nor busy regardless of allocflags.
  *
  * On success VM_PAGER_OK is returned and *mp is set to the page.  On
  * failure *mp is set to NULL and either VM_PAGER_FAIL (the page is absent
  * or not fully valid and VM_ALLOC_NOCREAT was given) or the pager's error
  * code is returned.
  *
  * The object must be write-locked; the lock is dropped around the pager
  * call.  VM_ALLOC_NOWAIT, VM_ALLOC_WAITFAIL and VM_ALLOC_ZERO are not
  * accepted.
  */
 int
 vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	int after, i, pflags, rv;
 
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 	KASSERT((allocflags &
 	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
 	    ("vm_page_grab_valid: Invalid flags 0x%X", allocflags));
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * Busy/wire requests are applied at "out"; they are stripped from the
 	 * flags used for allocation, and a failed allocation is retried.
 	 */
 	pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY |
 	    VM_ALLOC_WIRED);
 	pflags |= VM_ALLOC_WAITFAIL;
 
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		/*
 		 * If the page is fully valid it can only become invalid
 		 * with the object lock held.  If it is not valid it can
 		 * become valid with the busy lock held.  Therefore, we
 		 * may unnecessarily lock the exclusive busy here if we
 		 * race with I/O completion not using the object lock.
 		 * However, we will not end up with an invalid page and a
 		 * shared lock.
 		 */
 		if (!vm_page_trybusy(m,
 		    vm_page_all_valid(m) ? allocflags : 0)) {
 			(void)vm_page_grab_sleep(object, m, pindex, "pgrbwt",
 			    allocflags, true);
 			goto retrylookup;
 		}
 		if (vm_page_all_valid(m))
 			goto out;
 		if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
 			vm_page_busy_release(m);
 			*mp = NULL;
 			return (VM_PAGER_FAIL);
 		}
 	} else if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
 		*mp = NULL;
 		return (VM_PAGER_FAIL);
 	} else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) {
 		goto retrylookup;
 	}
 
 	/* From here on m is exclusively busied and not fully valid. */
 	vm_page_assert_xbusied(m);
 	if (vm_pager_has_page(object, pindex, NULL, &after)) {
 		/*
 		 * Clamp the run length to the size of ma[] and to the
 		 * caller's VM_ALLOC_COUNT, but always include the page itself.
 		 */
 		after = MIN(after, VM_INITIAL_PAGEIN);
 		after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT);
 		after = MAX(after, 1);
 		ma[0] = m;
 		/*
 		 * Collect following pages for the same pager request.  Stop
 		 * at the first one that has valid bits set, cannot be
 		 * exclusively busied, or cannot be allocated.
 		 */
 		for (i = 1; i < after; i++) {
 			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
 				if (ma[i]->valid || !vm_page_tryxbusy(ma[i]))
 					break;
 			} else {
 				ma[i] = vm_page_alloc(object, m->pindex + i,
 				    VM_ALLOC_NORMAL);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		after = i;
 		vm_object_pip_add(object, after);
 		VM_OBJECT_WUNLOCK(object);
 		rv = vm_pager_get_pages(object, ma, after, NULL, NULL);
 		VM_OBJECT_WLOCK(object);
 		vm_object_pip_wakeupn(object, after);
 		/* Pager may have replaced a page. */
 		m = ma[0];
 		if (rv != VM_PAGER_OK) {
 			/* Free the unwired pages of the run; unbusy the rest. */
 			for (i = 0; i < after; i++) {
 				if (!vm_page_wired(ma[i]))
 					vm_page_free(ma[i]);
 				else
 					vm_page_xunbusy(ma[i]);
 			}
 			*mp = NULL;
 			return (rv);
 		}
 		for (i = 1; i < after; i++)
 			vm_page_readahead_finish(ma[i]);
 		MPASS(vm_page_all_valid(m));
 	} else {
 		/* The pager has no copy of this page. */
 		vm_page_zero_invalid(m, TRUE);
 	}
 out:
 	/* Apply the wiring and busy state the caller asked for. */
 	if ((allocflags & VM_ALLOC_WIRED) != 0)
 		vm_page_wire(m);
 	if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m))
 		vm_page_busy_downgrade(m);
 	else if ((allocflags & VM_ALLOC_NOBUSY) != 0)
 		vm_page_busy_release(m);
 	*mp = m;
 	return (VM_PAGER_OK);
 }
 
 /*
  * Grab a valid page without taking the object lock when possible.  If the
  * page is not fully valid or not yet allocated, this falls back to
  * vm_page_grab_valid() under the object write lock, unless
  * VM_ALLOC_NOCREAT was given, in which case *mp is set to NULL and
  * VM_PAGER_FAIL is returned.
  */
 int
 vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object,
     vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 	int acqflags, rv;
 
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY "
 	    "mismatch"));
 	KASSERT((allocflags &
 	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
 	    ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags));
 
 	/*
 	 * Attempt a lockless lookup and busy.  We need at least an sbusy
 	 * before we can inspect the valid field and return a wired page, so
 	 * the wiring and the busy release are deferred until validity is
 	 * known.
 	 */
 	acqflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
 	if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, acqflags))
 		return (VM_PAGER_FAIL);
 
 	m = *mp;
 	if (m != NULL && vm_page_all_valid(m)) {
 		if ((allocflags & VM_ALLOC_WIRED) != 0)
 			vm_page_wire(m);
 		vm_page_grab_release(m, allocflags);
 		return (VM_PAGER_OK);
 	}
 	/* Found but not fully valid: let go before taking the slow path. */
 	if (m != NULL)
 		vm_page_busy_release(m);
 
 	if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
 		*mp = NULL;
 		return (VM_PAGER_FAIL);
 	}
 
 	VM_OBJECT_WLOCK(object);
 	rv = vm_page_grab_valid(mp, object, pindex, allocflags);
 	VM_OBJECT_WUNLOCK(object);
 
 	return (rv);
 }
 
 /*
  * Return the specified range of pages from the given object.  For each
  * page offset within the range, if a page already exists within the object
  * at that offset and it is busy, then wait for it to change state.  If,
  * instead, the page doesn't exist, then allocate it.
  *
  * The pages are stored in ma[0 .. return value - 1]; the return value is
  * the number of consecutive pages, starting at pindex, that were grabbed.
  *
  * The caller must always specify an allocation class.
  *
  * allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs the pages
  *
  * The caller must always specify that the pages are to be busied and/or
  * wired.
  *
  * optional allocation flags:
  *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NOWAIT		do not sleep
  *	VM_ALLOC_SBUSY		set page to sbusy state
  *	VM_ALLOC_WIRED		wire the pages
  *	VM_ALLOC_ZERO		zero and validate any invalid pages
  *
  * VM_ALLOC_COUNT() must not be used; the routine supplies its own count
  * hint to the allocator.
  *
  * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
  * may return a partial prefix of the requested range.
  */
 int
 vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
     vm_page_t *ma, int count)
 {
 	vm_page_t m, mpred;
 	int pflags;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
 	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
 	KASSERT(count > 0,
 	    ("vm_page_grab_pages: invalid page count %d", count));
 	vm_page_grab_check(allocflags);
 
 	pflags = vm_page_grab_pflags(allocflags);
 	i = 0;
 retrylookup:
 	/*
 	 * Find the page at pindex + i, or failing that its predecessor in
 	 * the object, which is handed to the allocator as the insertion
 	 * hint.  Pages already stored in ma[0 .. i - 1] are kept across
 	 * retries.
 	 */
 	m = vm_radix_lookup_le(&object->rtree, pindex + i);
 	if (m == NULL || m->pindex != pindex + i) {
 		mpred = m;
 		m = NULL;
 	} else
 		mpred = TAILQ_PREV(m, pglist, listq);
 	for (; i < count; i++) {
 		if (m != NULL) {
 			if (!vm_page_tryacquire(m, allocflags)) {
 				if (vm_page_grab_sleep(object, m, pindex + i,
 				    "grbmaw", allocflags, true))
 					goto retrylookup;
 				break;
 			}
 		} else {
 			if ((allocflags & VM_ALLOC_NOCREAT) != 0)
 				break;
 			m = vm_page_alloc_after(object, pindex + i,
 			    pflags | VM_ALLOC_COUNT(count - i), mpred);
 			if (m == NULL) {
 				if ((allocflags & (VM_ALLOC_NOWAIT |
 				    VM_ALLOC_WAITFAIL)) != 0)
 					break;
 				goto retrylookup;
 			}
 		}
 		/* Only pages with no valid bits at all are zero-filled. */
 		if (vm_page_none_valid(m) &&
 		    (allocflags & VM_ALLOC_ZERO) != 0) {
 			if ((m->flags & PG_ZERO) == 0)
 				pmap_zero_page(m);
 			vm_page_valid(m);
 		}
 		vm_page_grab_release(m, allocflags);
 		ma[i] = mpred = m;
 		m = vm_page_next(m);
 	}
 	return (i);
 }
 
 /*
  * Lockless front end to vm_page_grab_pages().  It accepts the same flags.
  * Pages are acquired without the object lock for as long as they are found
  * in the object; the first missing page ends the lockless phase and the
  * remainder of the range is handled by vm_page_grab_pages() under the
  * object write lock, unless VM_ALLOC_NOCREAT was given.
  *
  * Returns the number of pages stored in ma[].
  */
 int
 vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
     int allocflags, vm_page_t *ma, int count)
 {
 	vm_page_t m, last;
 	int acqflags, i;
 
 	KASSERT(count > 0,
 	    ("vm_page_grab_pages_unlocked: invalid page count %d", count));
 	vm_page_grab_check(allocflags);
 
 	/*
 	 * Modify flags for lockless acquire to hold the page until we
 	 * set it valid if necessary.
 	 */
 	acqflags = allocflags & ~VM_ALLOC_NOBUSY;
 	last = NULL;
 	for (i = 0; i < count; i++, pindex++) {
 		if (!vm_page_acquire_unlocked(object, pindex, last, &m,
 		    acqflags))
 			return (i);
 		if (m == NULL)
 			break;
 		if ((acqflags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) {
 			if ((m->flags & PG_ZERO) == 0)
 				pmap_zero_page(m);
 			vm_page_valid(m);
 		}
 		/* m will still be wired or busy according to flags. */
 		vm_page_grab_release(m, allocflags);
 		ma[i] = m;
 		last = m;
 	}
 	if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
 		return (i);
 
 	/* Finish the remaining count - i pages under the object lock. */
 	VM_OBJECT_WLOCK(object);
 	i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count - i);
 	VM_OBJECT_WUNLOCK(object);
 
 	return (i);
 }
 
 /*
  * Mapping function for valid or dirty bits in a page.
  *
  * Returns a mask with one bit set for every DEV_BSIZE-sized chunk touched
  * by the byte range [base, base + size), or 0 if size is 0.
  *
  * Inputs are required to range within a page.
  */
 vm_page_bits_t
 vm_page_bits(int base, int size)
 {
 	int lo, hi;
 
 	KASSERT(base + size <= PAGE_SIZE,
 	    ("vm_page_bits: illegal base/size %d/%d", base, size));
 
 	/* An empty range touches no chunk. */
 	if (size == 0)
 		return (0);
 
 	lo = base >> DEV_BSHIFT;
 	hi = (base + size - 1) >> DEV_BSHIFT;
 
 	/* (2 << hi) - (1 << lo) sets bits lo through hi inclusive. */
 	return (((vm_page_bits_t)2 << hi) - ((vm_page_bits_t)1 << lo));
 }
 
 /*
  * Atomically set the bits given in "set" in the page bit field at "bits".
  *
  * The width of the atomic operation is selected at compile time from
  * PAGE_SIZE.  When no atomic of the natural width is available, the
  * fallback operates on the aligned 32-bit word that contains the field.
  * The "m" argument is not used by any of the variants.
  */
 void
 vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set)
 {
 
 #if PAGE_SIZE == 32768
 	atomic_set_64((uint64_t *)bits, set);
 #elif PAGE_SIZE == 16384
 	atomic_set_32((uint32_t *)bits, set);
 #elif (PAGE_SIZE == 8192) && defined(atomic_set_16)
 	atomic_set_16((uint16_t *)bits, set);
 #elif (PAGE_SIZE == 4096) && defined(atomic_set_8)
 	atomic_set_8((uint8_t *)bits, set);
 #else		/* PAGE_SIZE <= 8192 */
 	uintptr_t addr;
 	int shift;
 
 	addr = (uintptr_t)bits;
 	/*
 	 * Use a trick to perform a 32-bit atomic on the
 	 * containing aligned word, to not depend on the existence
 	 * of atomic_{set, clear}_{8, 16}.
 	 */
 	/* Byte offset of the field within its aligned 32-bit word. */
 	shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
 #else
 	shift *= NBBY;
 #endif
 	/* Round the address down to the containing word. */
 	addr &= ~(sizeof(uint32_t) - 1);
 	atomic_set_32((uint32_t *)addr, set << shift);
 #endif		/* PAGE_SIZE */
 }
 
 /*
  * Atomically clear the bits given in "clear" in the page bit field at
  * "bits".
  *
  * This mirrors vm_page_bits_set(): the width of the atomic operation is
  * selected at compile time from PAGE_SIZE, with a fallback that operates on
  * the aligned 32-bit word containing the field.  The "m" argument is not
  * used by any of the variants.
  */
 static inline void
 vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
 {
 
 #if PAGE_SIZE == 32768
 	atomic_clear_64((uint64_t *)bits, clear);
 #elif PAGE_SIZE == 16384
 	atomic_clear_32((uint32_t *)bits, clear);
 #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16)
 	atomic_clear_16((uint16_t *)bits, clear);
 #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8)
 	atomic_clear_8((uint8_t *)bits, clear);
 #else		/* PAGE_SIZE <= 8192 */
 	uintptr_t addr;
 	int shift;
 
 	addr = (uintptr_t)bits;
 	/*
 	 * Use a trick to perform a 32-bit atomic on the
 	 * containing aligned word, to not depend on the existence
 	 * of atomic_{set, clear}_{8, 16}.
 	 */
 	/* Byte offset of the field within its aligned 32-bit word. */
 	shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
 #else
 	shift *= NBBY;
 #endif
 	/* Round the address down to the containing word. */
 	addr &= ~(sizeof(uint32_t) - 1);
 	atomic_clear_32((uint32_t *)addr, clear << shift);
 #endif		/* PAGE_SIZE */
 }
 
 /*
  * Atomically replace the page bit field at "bits" with "newbits" and
  * return the value it held before the replacement.
  *
  * Each variant is a compare-and-set loop that retries until it succeeds.
  * The width is selected at compile time from PAGE_SIZE; the fallback works
  * on the aligned 32-bit word containing the field and leaves the other
  * bytes of that word unchanged.  The "m" argument is not used by any of
  * the variants.
  */
 static inline vm_page_bits_t
 vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
 {
 #if PAGE_SIZE == 32768
 	uint64_t old;
 
 	old = *bits;
 	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
 	return (old);
 #elif PAGE_SIZE == 16384
 	uint32_t old;
 
 	old = *bits;
 	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
 	return (old);
 #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
 	uint16_t old;
 
 	old = *bits;
 	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
 	return (old);
 #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
 	uint8_t old;
 
 	old = *bits;
 	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
 	return (old);
 #else		/* PAGE_SIZE <= 4096*/
 	uintptr_t addr;
 	uint32_t old, new, mask;
 	int shift;
 
 	addr = (uintptr_t)bits;
 	/*
 	 * Use a trick to perform a 32-bit atomic on the
 	 * containing aligned word, to not depend on the existence
 	 * of atomic_{set, swap, clear}_{8, 16}.
 	 */
 	shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
 #else
 	shift *= NBBY;
 #endif
 	addr &= ~(sizeof(uint32_t) - 1);
 	/* Mask selecting this field's bits within the containing word. */
 	mask = VM_PAGE_BITS_ALL << shift;
 
 	/*
 	 * NOTE(review): this seeds "old" from the narrow field rather than
 	 * from the containing 32-bit word, so the first compare-and-set is
 	 * presumably expected to fail and reload "old" whenever the word's
 	 * other bytes are non-zero -- confirm this is intended.
 	 */
 	old = *bits;
 	do {
 		new = old & ~mask;
 		new |= newbits << shift;
 	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
 	return (old >> shift);
 #endif		/* PAGE_SIZE */
 }
 
 /*
  *	vm_page_set_valid_range:
  *
  *	Sets portions of a page valid.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page must be busied.  A zero "size" is a no-op.
  */
 void
 vm_page_set_valid_range(vm_page_t m, int base, int size)
 {
 	int endoff, frag;
 	vm_page_bits_t pagebits;
 
 	vm_page_assert_busied(m);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 *
 	 * The single-bit mask is built in vm_page_bits_t, as in
 	 * vm_page_set_validclean(), so that the shift is performed in a
 	 * type wide enough for every bit index of the valid field rather
 	 * than in a plain (signed) int.
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
 	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
 
 	/*
 	 * Set valid bits inclusive of any overlap.  An exclusive busy holder
 	 * updates the field directly; otherwise the update is atomic.
 	 */
 	pagebits = vm_page_bits(base, size);
 	if (vm_page_xbusied(m))
 		m->valid |= pagebits;
 	else
 		vm_page_bits_set(m, &m->valid, pagebits);
 }
 
 /*
  * Mark every block of the page dirty and return the dirty bits it had
  * beforehand.  If the page was previously completely clean and carries
  * PGA_SWAP_SPACE, the pager is told the page is no longer swapped.
  */
 vm_page_bits_t
 vm_page_set_dirty(vm_page_t m)
 {
 	vm_page_bits_t prev;
 
 	VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	if (!vm_page_xbusied(m) || pmap_page_is_write_mapped(m)) {
 		/* Other updaters are possible: swap the field atomically. */
 		prev = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
 	} else {
 		/* Exclusively busied and not write mapped: plain store. */
 		prev = m->dirty;
 		m->dirty = VM_PAGE_BITS_ALL;
 	}
 
 	if (prev == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
 		vm_pager_page_unswapped(m);
 
 	return (prev);
 }
 
 /*
  * Remove the bits in "pagebits" from the dirty field of a busied page.
  */
 static __inline void
 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
 {
 
 	vm_page_assert_busied(m);
 
 	/*
 	 * Unless the page is both exclusively busied and free of writeable
 	 * mappings, the pmap layer may call vm_page_dirty() without holding
 	 * a distinguished lock, so the dirty field has to be updated
 	 * atomically.  Page busy plus atomic operations is enough to keep
 	 * the field consistent.
 	 */
 	if (!vm_page_xbusied(m) || pmap_page_is_write_mapped(m)) {
 		vm_page_bits_clear(m, &m->dirty, pagebits);
 		return;
 	}
 
 	/* We are the only thread able to modify the dirty bits. */
 	m->dirty &= ~pagebits;
 }
 
 /*
  *	vm_page_set_validclean:
  *
  *	Sets portions of a page valid and clean.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page must be busied.  A zero "size" is a no-op.
  */
 void
 vm_page_set_validclean(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t oldvalid, pagebits;
 	int endoff, frag;
 
 	vm_page_assert_busied(m);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
 	 * use this opportunity to clear the PGA_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
 	 * We set valid bits inclusive of any overlap, but we can only
 	 * clear dirty bits for DEV_BSIZE chunks that are fully within
 	 * the range.
 	 */
 	/* Remember the valid bits as they were before this call's update. */
 	oldvalid = m->valid;
 	pagebits = vm_page_bits(base, size);
 	if (vm_page_xbusied(m))
 		m->valid |= pagebits;
 	else
 		vm_page_bits_set(m, &m->valid, pagebits);
 	/*
 	 * The block below is compiled out; as written, the dirty bits are
 	 * cleared using the same inclusive mask that was used for valid.
 	 */
 #if 0	/* NOT YET */
 	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 		frag = DEV_BSIZE - frag;
 		base += frag;
 		size -= frag;
 		if (size < 0)
 			size = 0;
 	}
 	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 #endif
 	if (base == 0 && size == PAGE_SIZE) {
 		/*
 		 * The page can only be modified within the pmap if it is
 		 * mapped, and it can only be mapped if it was previously
 		 * fully valid.
 		 */
 		if (oldvalid == VM_PAGE_BITS_ALL)
 			/*
 			 * Perform the pmap_clear_modify() first.  Otherwise,
 			 * a concurrent pmap operation, such as
 			 * pmap_protect(), could clear a modification in the
 			 * pmap and set the dirty field on the page before
 			 * pmap_clear_modify() had begun and after the dirty
 			 * field was cleared here.
 			 */
 			pmap_clear_modify(m);
 		m->dirty = 0;
 		vm_page_aflag_clear(m, PGA_NOSYNC);
 	} else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m))
 		/*
 		 * Not previously fully valid and exclusively busied: the
 		 * dirty field is updated with a plain store.
 		 */
 		m->dirty &= ~pagebits;
 	else
 		vm_page_clear_dirty_mask(m, pagebits);
 }
 
 /*
  * Clear the dirty bits that correspond to the byte range
  * [base, base + size) of the page.
  */
 void
 vm_page_clear_dirty(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t mask;
 
 	mask = vm_page_bits(base, size);
 	vm_page_clear_dirty_mask(m, mask);
 }
 
 /*
  *	vm_page_set_invalid:
  *
  *	Marks the DEV_BSIZE'd chunks covered by the given range invalid.
  *	Both the valid and the dirty bits of the affected chunks are
  *	cleared.
  */
 void
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	vm_object_t obj;
 	vm_page_bits_t mask;
 
 	/*
 	 * Holding the object write lock keeps the page from being mapped
 	 * read-only while it is in the middle of being invalidated.
 	 */
 	obj = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(obj);
 	vm_page_assert_busied(m);
 
 	/*
 	 * For a vnode object, a range that starts at offset zero and reaches
 	 * the recorded vnode size is treated as covering the whole page.
 	 */
 	if (obj->type == OBJT_VNODE && base == 0 &&
 	    IDX_TO_OFF(m->pindex) + size >= obj->un_pager.vnp.vnp_size)
 		mask = VM_PAGE_BITS_ALL;
 	else
 		mask = vm_page_bits(base, size);
 
 	/* Drop every mapping of a fully valid page before invalidating. */
 	if (obj->ref_count != 0 && vm_page_all_valid(m) && mask != 0)
 		pmap_remove_all(m);
 	KASSERT((mask == 0 && vm_page_all_valid(m)) ||
 	    !pmap_page_is_mapped(m),
 	    ("vm_page_set_invalid: page %p is mapped", m));
 
 	if (!vm_page_xbusied(m)) {
 		vm_page_bits_clear(m, &m->valid, mask);
 		vm_page_bits_clear(m, &m->dirty, mask);
 		return;
 	}
 
 	/* Exclusive busy holder: plain stores are sufficient. */
 	m->valid &= ~mask;
 	m->dirty &= ~mask;
 }
 
 /*
  *	vm_page_invalid:
  *
  *	Clears every valid bit of the page.  The caller must hold the page
  *	busy and the enclosing object's lock, and the page must not be
  *	mapped.  The object lock guards against a concurrent read-only
  *	pmap enter, which is performed without the page being busied.
  */
 void
 vm_page_invalid(vm_page_t m)
 {
 
 	vm_page_assert_busied(m);
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	MPASS(!pmap_page_is_mapped(m));
 
 	if (!vm_page_xbusied(m)) {
 		vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL);
 		return;
 	}
 
 	/* Exclusive busy holder: a plain store is sufficient. */
 	m->valid = 0;
 }
 
 /*
  * vm_page_zero_invalid()
  *
  *	The kernel treats the invalid portions of a page as garbage, yet
  *	such a page can still be mapped by user code.  Before that happens
  *	the non-valid portions must be zeroed so that user code sees what
  *	it expects.
  *
  *	A page is most commonly only partly valid when the tail of a file
  *	whose size is not page aligned is mapped into memory.
  */
 void
 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 {
 	int blk, start;
 
 	/*
 	 * Walk the valid bits and zero each maximal run of invalid
 	 * DEV_BSIZE blocks.  "start" is the first block of the current
 	 * candidate run; a valid block, or the position just past the last
 	 * block, terminates the run.  Invalid sub-DEV_BSIZE'd areas (where
 	 * the valid bit may be set) are expected to have been zeroed
 	 * already by vm_page_set_validclean().
 	 */
 	start = 0;
 	for (blk = 0; blk <= PAGE_SIZE / DEV_BSIZE; blk++) {
 		if (blk != (PAGE_SIZE / DEV_BSIZE) &&
 		    (m->valid & ((vm_page_bits_t)1 << blk)) == 0)
 			continue;
 		if (blk > start)
 			pmap_zero_page_area(m, start << DEV_BSHIFT,
 			    (blk - start) << DEV_BSHIFT);
 		start = blk + 1;
 	}
 
 	/*
 	 * setvalid is TRUE when the freshly zeroed areas may safely be
 	 * marked valid, i.e. when there are no cache consistency issues.
 	 * E.g. this is fine for UFS but not for NFS.
 	 */
 	if (setvalid)
 		vm_page_valid(m);
 }
 
 /*
  *	vm_page_is_valid:
  *
  *	Reports whether every block touched by the given range is valid.
  *	A page with no valid bits at all is never reported as valid,
  *	whatever the range.
  *
  *	Some callers invoke this routine without the busy lock held and
  *	handle races via higher level locks.  Typical callers should
  *	hold a busy lock to prevent invalidation.
  */
 int
 vm_page_is_valid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t wanted;
 
 	wanted = vm_page_bits(base, size);
 	if (m->valid == 0)
 		return (0);
 	return ((m->valid & wanted) == wanted);
 }
 
 /*
  * Evaluates the predicates selected by "flags" against every page of the
  * (super)page that starts at "m", optionally exempting "skip_m" from
  * everything but the object check.  Returns true only if all selected
  * predicates hold for all pages.
  */
 bool
 vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
 {
 	vm_object_t obj;
 	vm_page_t p;
 	int count, idx;
 
 	obj = m->object;
 	if (skip_m != NULL && skip_m->object != obj)
 		return (false);
 	VM_OBJECT_ASSERT_LOCKED(obj);
 	count = atop(pagesizes[m->psind]);
 
 	/*
 	 * The physically contiguous pages that form a superpage, i.e., a
 	 * page whose page size index ("psind") is greater than zero, sit
 	 * in adjacent entries of vm_page_array[].
 	 */
 	for (idx = 0; idx < count; idx++) {
 		p = &m[idx];
 
 		/* Object consistency is checked even for "skip_m". */
 		if (p->object != obj)
 			return (false);
 		if (p == skip_m)
 			continue;
 		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(p))
 			return (false);
 		/*
 		 * Consulting vm_page_test_dirty() or pmap_is_modified()
 		 * might avoid a spurious "false" here, but doing so would
 		 * need a write lock on the object containing the page.
 		 */
 		if ((flags & PS_ALL_DIRTY) != 0 &&
 		    p->dirty != VM_PAGE_BITS_ALL)
 			return (false);
 		if ((flags & PS_ALL_VALID) != 0 &&
 		    p->valid != VM_PAGE_BITS_ALL)
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * If the page is not already fully dirty and the pmap reports it as
  * modified, mark the page dirty.  The page must be busied.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
 
 	vm_page_assert_busied(m);
 
 	/* Already fully dirty: no need to ask the pmap. */
 	if (m->dirty == VM_PAGE_BITS_ALL)
 		return;
 	if (pmap_is_modified(m))
 		vm_page_dirty(m);
 }
 
 /*
  * Mark every block of a busied page valid.
  */
 void
 vm_page_valid(vm_page_t m)
 {
 
 	vm_page_assert_busied(m);
 
 	if (!vm_page_xbusied(m)) {
 		vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL);
 		return;
 	}
 
 	/* Exclusive busy holder: a plain store is sufficient. */
 	m->valid = VM_PAGE_BITS_ALL;
 }
 
 /*
  * Out-of-line function that locks the mutex returned by vm_page_lockptr()
  * for the page, passing no option flags and forwarding the caller's
  * file and line to mtx_lock_flags_().
  */
 void
 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Out-of-line function that unlocks the mutex returned by
  * vm_page_lockptr() for the page, passing no option flags and forwarding
  * the caller's file and line to mtx_unlock_flags_().
  */
 void
 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Out-of-line function that tries to lock the mutex returned by
  * vm_page_lockptr() for the page and returns the result of
  * mtx_trylock_flags_() unchanged.
  */
 int
 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
 }
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 /*
  * Assert that the page's lock is owned (MA_OWNED), on behalf of the
  * caller at file:line.  Only built when INVARIANTS or INVARIANT_SUPPORT
  * is defined.
  */
 void
 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 {
 
 	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 }
 
 /*
  * Assert condition "a" on the mutex returned by vm_page_lockptr() for the
  * page, forwarding the caller's file and line to mtx_assert_().  Only
  * built when INVARIANTS or INVARIANT_SUPPORT is defined.
  */
 void
 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 {
 
 	mtx_assert_(vm_page_lockptr(m), a, file, line);
 }
 #endif
 
 #ifdef INVARIANTS
 /*
  * For a page that belongs to an object and is not itself busied, assert
  * that the object is busy.
  */
 void
 vm_page_object_busy_assert(vm_page_t m)
 {
 
 	/*
 	 * Some of the page's fields may be modified only by whoever
 	 * holds a page busy or an object busy.
 	 */
 	if (m->object == NULL || vm_page_busied(m))
 		return;
 	VM_OBJECT_ASSERT_BUSY(m->object);
 }
 
 /*
  * Check the preconditions for setting PGA_WRITEABLE on a page.  Nothing
  * is checked unless "bits" includes PGA_WRITEABLE.
  */
 void
 vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits)
 {
 
 	if ((bits & PGA_WRITEABLE) != 0) {
 		/*
 		 * PGA_WRITEABLE may be set only on a managed page, and only
 		 * while the page is exclusively busied or its object is
 		 * busy.  Currently, this flag is only set by pmap_enter().
 		 */
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("PGA_WRITEABLE on unmanaged page"));
 		if (!vm_page_xbusied(m))
 			VM_OBJECT_ASSERT_BUSY(m->object);
 	}
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  * DDB "show page": print the global page counters and thresholds.  The
  * first five values come from the vm_*_count() accessors and the rest
  * are read directly from vm_cnt; every line is labelled with a vm_cnt
  * field name.
  */
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
 
 	db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
 	db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
 	db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
 	db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
 	db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
 	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
 	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
 	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
 	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 /*
  * DDB "show pageq": print the global free page count, then one line per
  * VM domain with its page count, free count and the lengths of its
  * active, inactive, laundry and unswappable page queues.
  */
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
 	int dom;
 
 	db_printf("pq_free %d\n", vm_free_count());
 	/* One summary line for each of the vm_ndomains domains. */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
     "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
 	}
 }
 
 /*
  * DDB "show pginfo": dump the main fields of a single vm_page.
  *
  * An address is required.  Its interpretation depends on the modifier
  * string: with 'v' it is a kernel virtual address that is translated
  * with pmap_kextract(); with 'p' it is a physical address; with neither
  * it is used directly as a vm_page pointer.  'v' wins if both are given.
  */
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
 	boolean_t phys, virt;
 
 	/* Without an address just print a usage line. */
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
 		return;
 	}
 
 	phys = strchr(modif, 'p') != NULL;
 	virt = strchr(modif, 'v') != NULL;
 	if (virt)
 		m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
 	else if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;
 	/*
 	 * NOTE(review): "m" is dereferenced without a NULL check; confirm
 	 * whether PHYS_TO_VM_PAGE() can return NULL for an address that has
 	 * no vm_page and whether the debugger tolerates the resulting fault.
 	 *
 	 * NOTE(review): valid and dirty are printed with %x, which assumes
 	 * vm_page_bits_t is no wider than an int -- TODO confirm for the
 	 * larger PAGE_SIZE configurations.
 	 */
 	db_printf(
     "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n"
     "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
 	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 	    m->a.queue, m->ref_count, m->a.flags, m->oflags,
 	    m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty);
 }
 #endif /* DDB */