diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 16ec277e9c34..33defe79c8b9 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1,1103 +1,1102 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #include #endif #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define GiB(v) (v ## ULL << 30) #define AP_BOOTPT_SZ (PAGE_SIZE * 4) /* Temporary variables for init_secondary() */ static char *doublefault_stack; static char *mce_stack; static char *nmi_stack; static char *dbg_stack; void *bootpcpu; extern u_int mptramp_la57; extern u_int mptramp_nx; /* * Local data and functions. */ static int start_ap(int apic_id, vm_paddr_t boot_address); /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; } /* Install an inter-CPU IPI for cache and TLB invalidations. */ setidt(IPI_INVLOP, pti ? IDTVEC(invlop_pti) : IDTVEC(invlop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) : IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) : IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); /* Install an IPI for calling delayed SWI */ setidt(IPI_SWI, pti ? IDTVEC(ipi_swi_pti) : IDTVEC(ipi_swi), SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); mptramp_la57 = la57; mptramp_nx = pg_nx != 0; MPASS(kernel_pmap->pm_cr3 < (1UL << 32)); mptramp_pagetables = kernel_pmap->pm_cr3; /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); #if defined(DEV_ACPI) && MAXMEMDOM > 1 acpi_pxm_set_cpu_locality(); #endif } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; struct nmi_pcpu *np; struct user_segment_descriptor *gdt; struct region_descriptor ap_gdt; u_int64_t cr0; int cpu, gsel_tss, x; /* Set by the startup code for us to use */ cpu = bootAP; /* Update microcode before doing anything else. */ ucode_load_ap(cpu); /* Initialize the PCPU area. */ pc = bootpcpu; pcpu_init(pc, cpu, sizeof(struct pcpu)); dpcpu_init(dpcpu, cpu); pc->pc_apic_id = cpu_apic_ids[cpu]; pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_tssp = &pc->pc_common_tss; pc->pc_rsp0 = 0; pc->pc_pti_rsp0 = (((vm_offset_t)&pc->pc_pti_stack + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); gdt = pc->pc_gdt; pc->pc_tss = (struct system_segment_descriptor *)&gdt[GPROC0_SEL]; pc->pc_fs32p = &gdt[GUFS32_SEL]; pc->pc_gs32p = &gdt[GUGS32_SEL]; pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]; pc->pc_ucr3_load_mask = PMAP_UCR3_NOMASK; /* See comment in pmap_bootstrap(). */ pc->pc_pcid_next = PMAP_PCID_KERN + 2; pc->pc_pcid_gen = 1; pc->pc_smp_tlb_gen = 1; /* Init tss */ pc->pc_common_tss = __pcpu[0].pc_common_tss; pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; pc->pc_common_tss.tss_rsp0 = 0; /* The doublefault stack runs on IST1. */ np = ((struct nmi_pcpu *)&doublefault_stack[DBLFAULT_STACK_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist1 = (long)np; /* The NMI stack runs on IST2. */ np = ((struct nmi_pcpu *)&nmi_stack[NMI_STACK_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist2 = (long)np; /* The MC# stack runs on IST3. */ np = ((struct nmi_pcpu *)&mce_stack[MCE_STACK_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist3 = (long)np; /* The DB# stack runs on IST4. */ np = ((struct nmi_pcpu *)&dbg_stack[DBG_STACK_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist4 = (long)np; /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long)&pc->pc_common_tss; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != GPROC0_SEL + 1 && x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; ap_gdt.rd_base = (u_long)gdt; lgdt(&ap_gdt); /* does magic intra-segment return */ wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (uint64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value */ fix_cpuid(); lidt(&r_idt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); amd64_conf_fast_syscall(); /* signal our startup to the BSP. */ mp_naps++; /* Spin until the BSP releases the AP's. */ while (atomic_load_acq_int(&aps_ready) == 0) ia32_pause(); init_secondary_tail(); } /******************************************************************* * local functions and data */ #ifdef NUMA static void mp_realloc_pcpu(int cpuid, int domain) { vm_page_t m; vm_offset_t oa, na; oa = (vm_offset_t)&__pcpu[cpuid]; if (vm_phys_domain(pmap_kextract(oa)) == domain) return; - m = vm_page_alloc_domain(NULL, 0, domain, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + m = vm_page_alloc_noobj_domain(domain, 0); if (m == NULL) return; na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagecopy((void *)oa, (void *)na); pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1); /* XXX old pcpu page leaked. */ } #endif /* * start each AP in our list */ int start_all_aps(void) { vm_page_t m_boottramp, m_pml4, m_pdp, m_pd[4]; pml5_entry_t old_pml45; pml4_entry_t *v_pml4; pdp_entry_t *v_pdp; pd_entry_t *v_pd; vm_paddr_t boot_address; u_int32_t mpbioswarmvec; int apic_id, cpu, domain, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); MPASS(bootMP_size <= PAGE_SIZE); m_boottramp = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 20), /* Trampoline should be below 1M for real mode */ PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); boot_address = VM_PAGE_TO_PHYS(m_boottramp); /* Create a transient 1:1 mapping of low 4G */ if (la57) { m_pml4 = pmap_page_alloc_below_4g(true); v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); } else { v_pml4 = &kernel_pmap->pm_pmltop[0]; } m_pdp = pmap_page_alloc_below_4g(true); v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); m_pd[0] = pmap_page_alloc_below_4g(false); v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0])); for (i = 0; i < NPDEPG; i++) v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; m_pd[1] = pmap_page_alloc_below_4g(false); v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1])); for (i = 0; i < NPDEPG; i++) v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; m_pd[2] = pmap_page_alloc_below_4g(false); v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2])); for (i = 0; i < NPDEPG; i++) v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; m_pd[3] = pmap_page_alloc_below_4g(false); v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3])); for (i = 0; i < NPDEPG; i++) v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; old_pml45 = kernel_pmap->pm_pmltop[0]; if (la57) { kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pmap_invalidate_all(kernel_pmap); /* copy the AP 1st level boot code */ bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); if (bootverbose) printf("AP boot address %#lx\n", boot_address); /* save the current value of the warm-start vector */ if (!efi_boot) mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ if (!efi_boot) { *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = (boot_address >> 4); } outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Relocate pcpu areas to the correct domain. */ #ifdef NUMA if (vm_ndomains > 1) for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; domain = acpi_pxm_get_cpu_locality(apic_id); mp_realloc_pcpu(cpu, domain); } #endif /* start each AP */ domain = 0; for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; #ifdef NUMA if (vm_ndomains > 1) domain = acpi_pxm_get_cpu_locality(apic_id); #endif /* allocate and set up an idle stack data page */ bootstacks[cpu] = (void *)kmem_malloc(kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(DBLFAULT_STACK_SIZE, M_WAITOK | M_ZERO); mce_stack = (char *)kmem_malloc(MCE_STACK_SIZE, M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc_domainset( DOMAINSET_PREF(domain), NMI_STACK_SIZE, M_WAITOK | M_ZERO); dbg_stack = (char *)kmem_malloc_domainset( DOMAINSET_PREF(domain), DBG_STACK_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), DPCPU_SIZE, M_WAITOK | M_ZERO); bootpcpu = &__pcpu[cpu]; bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; bootAP = cpu; /* attempt to start the Application Processor */ if (!start_ap(apic_id, boot_address)) { /* restore the warmstart vector */ if (!efi_boot) *(u_int32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("AP #%d (PHY# %d) failed!", cpu, apic_id); } CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warmstart vector */ if (!efi_boot) *(u_int32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* Destroy transient 1:1 mapping */ kernel_pmap->pm_pmltop[0] = old_pml45; invlpg(0); if (la57) vm_page_free(m_pml4); vm_page_free(m_pd[3]); vm_page_free(m_pd[2]); vm_page_free(m_pd[1]); vm_page_free(m_pd[0]); vm_page_free(m_pdp); vm_page_free(m_boottramp); /* number of APs actually started */ return (mp_naps); } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id, vm_paddr_t boot_address) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } /* * Flush the TLB on other CPU's */ /* * Invalidation request. PCPU pc_smp_tlb_op uses u_int instead of the * enum to avoid both namespace and ABI issues (with enums). */ enum invl_op_codes { INVL_OP_TLB = 1, INVL_OP_TLB_INVPCID = 2, INVL_OP_TLB_INVPCID_PTI = 3, INVL_OP_TLB_PCID = 4, INVL_OP_PGRNG = 5, INVL_OP_PGRNG_INVPCID = 6, INVL_OP_PGRNG_PCID = 7, INVL_OP_PG = 8, INVL_OP_PG_INVPCID = 9, INVL_OP_PG_PCID = 10, INVL_OP_CACHE = 11, }; /* * These variables are initialized at startup to reflect how each of * the different kinds of invalidations should be performed on the * current machine and environment. */ static enum invl_op_codes invl_op_tlb; static enum invl_op_codes invl_op_pgrng; static enum invl_op_codes invl_op_pg; /* * Scoreboard of IPI completion notifications from target to IPI initiator. * * Each CPU can initiate shootdown IPI independently from other CPUs. * Initiator enters critical section, then fills its local PCPU * shootdown info (pc_smp_tlb_ vars), then clears scoreboard generation * at location (cpu, my_cpuid) for each target cpu. After that IPI is * sent to all targets which scan for zeroed scoreboard generation * words. Upon finding such word the shootdown data is read from * corresponding cpu's pcpu, and generation is set. Meantime initiator * loops waiting for all zeroed generations in scoreboard to update. */ static uint32_t *invl_scoreboard; static void invl_scoreboard_init(void *arg __unused) { u_int i; invl_scoreboard = malloc(sizeof(uint32_t) * (mp_maxid + 1) * (mp_maxid + 1), M_DEVBUF, M_WAITOK); for (i = 0; i < (mp_maxid + 1) * (mp_maxid + 1); i++) invl_scoreboard[i] = 1; if (pmap_pcid_enabled) { if (invpcid_works) { if (pti) invl_op_tlb = INVL_OP_TLB_INVPCID_PTI; else invl_op_tlb = INVL_OP_TLB_INVPCID; invl_op_pgrng = INVL_OP_PGRNG_INVPCID; invl_op_pg = INVL_OP_PG_INVPCID; } else { invl_op_tlb = INVL_OP_TLB_PCID; invl_op_pgrng = INVL_OP_PGRNG_PCID; invl_op_pg = INVL_OP_PG_PCID; } } else { invl_op_tlb = INVL_OP_TLB; invl_op_pgrng = INVL_OP_PGRNG; invl_op_pg = INVL_OP_PG; } } SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL); static uint32_t * invl_scoreboard_getcpu(u_int cpu) { return (invl_scoreboard + cpu * (mp_maxid + 1)); } static uint32_t * invl_scoreboard_slot(u_int cpu) { return (invl_scoreboard_getcpu(cpu) + PCPU_GET(cpuid)); } /* * Used by the pmap to request cache or TLB invalidation on local and * remote processors. Mask provides the set of remote CPUs that are * to be signalled with the invalidation IPI. As an optimization, the * curcpu_cb callback is invoked on the calling CPU in a critical * section while waiting for the remote CPUs to complete the operation. * * The callback function is called unconditionally on the caller's * underlying processor, even when this processor is not set in the * mask. So, the callback function must be prepared to handle such * spurious invocations. * * Interrupts must be enabled when calling the function with smp * started, to avoid deadlock with other IPIs that are protected with * smp_ipi_mtx spinlock at the initiator side. * * Function must be called with the thread pinned, and it unpins on * completion. */ static void smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op) { cpuset_t other_cpus; uint32_t generation, *p_cpudone; int cpu; bool is_all; /* * It is not necessary to signal other CPUs while booting or * when in the debugger. */ if (kdb_active || KERNEL_PANICKED() || !smp_started) goto local_cb; KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); /* * Check for other cpus. Return if none. */ is_all = !CPU_CMP(&mask, &all_cpus); CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) goto local_cb; /* * Initiator must have interrupts enabled, which prevents * non-invalidation IPIs that take smp_ipi_mtx spinlock, * from deadlocking with us. On the other hand, preemption * must be disabled to pin initiator to the instance of the * pcpu pc_smp_tlb data and scoreboard line. */ KASSERT((read_rflags() & PSL_I) != 0, ("smp_targeted_tlb_shootdown: interrupts disabled")); critical_enter(); PCPU_SET(smp_tlb_addr1, addr1); PCPU_SET(smp_tlb_addr2, addr2); PCPU_SET(smp_tlb_pmap, pmap); generation = PCPU_GET(smp_tlb_gen); if (++generation == 0) generation = 1; PCPU_SET(smp_tlb_gen, generation); PCPU_SET(smp_tlb_op, op); /* Fence between filling smp_tlb fields and clearing scoreboard. */ atomic_thread_fence_rel(); CPU_FOREACH_ISSET(cpu, &mask) { KASSERT(*invl_scoreboard_slot(cpu) != 0, ("IPI scoreboard is zero, initiator %d target %d", PCPU_GET(cpuid), cpu)); *invl_scoreboard_slot(cpu) = 0; } /* * IPI acts as a fence between writing to the scoreboard above * (zeroing slot) and reading from it below (wait for * acknowledgment). */ if (is_all) { ipi_all_but_self(IPI_INVLOP); other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); } else { other_cpus = mask; ipi_selected(mask, IPI_INVLOP); } curcpu_cb(pmap, addr1, addr2); CPU_FOREACH_ISSET(cpu, &other_cpus) { p_cpudone = invl_scoreboard_slot(cpu); while (atomic_load_int(p_cpudone) != generation) ia32_pause(); } /* * Unpin before leaving critical section. If the thread owes * preemption, this allows scheduler to select thread on any * CPU from its cpuset. */ sched_unpin(); critical_exit(); return; local_cb: critical_enter(); curcpu_cb(pmap, addr1, addr2); sched_unpin(); critical_exit(); } void smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb) { smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb, invl_op_tlb); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap, smp_invl_cb_t curcpu_cb) { smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb, invl_op_pg); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, pmap_t pmap, smp_invl_cb_t curcpu_cb) { smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2, curcpu_cb, invl_op_pgrng); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } void smp_cache_flush(smp_invl_cb_t curcpu_cb) { smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0, curcpu_cb, INVL_OP_CACHE); } /* * Handlers for TLB related IPIs */ static void invltlb_handler(pmap_t smp_tlb_pmap) { #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); else invltlb(); } static void invltlb_invpcid_handler(pmap_t smp_tlb_pmap) { struct invpcid_descr d; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB : INVPCID_CTX); } static void invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap) { struct invpcid_descr d; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pad = 0; d.addr = 0; if (smp_tlb_pmap == kernel_pmap) { /* * This invalidation actually needs to clear kernel * mappings from the TLB in the current pmap, but * since we were asked for the flush in the kernel * pmap, achieve it by performing global flush. */ invpcid(&d, INVPCID_CTXGLOB); } else { invpcid(&d, INVPCID_CTX); if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); } } static void invltlb_pcid_handler(pmap_t smp_tlb_pmap) { uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ if (smp_tlb_pmap == kernel_pmap) { invltlb_glob(); } else { /* * The current pmap might not be equal to * smp_tlb_pmap. The clearing of the pm_gen in * pmap_invalidate_all() takes care of TLB * invalidation when switching to the pmap on this * CPU. */ if (smp_tlb_pmap == PCPU_GET(curpmap)) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; load_cr3(smp_tlb_pmap->pm_cr3 | pcid); if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); } } } static void invlpg_handler(vm_offset_t smp_tlb_addr1) { #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_addr1); } static void invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1) { struct invpcid_descr d; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = smp_tlb_addr1; invpcid(&d, INVPCID_ADDR); } } static void invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1) { uint64_t kcr3, ucr3; uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1); } } static void invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) { vm_offset_t addr, addr2; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); } static void invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) { struct invpcid_descr d; vm_offset_t addr, addr2; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = smp_tlb_addr1; do { invpcid(&d, INVPCID_ADDR); d.addr += PAGE_SIZE; } while (d.addr < addr2); } } static void invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) { vm_offset_t addr, addr2; uint64_t kcr3, ucr3; uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); if (smp_tlb_pmap == PCPU_GET(curpmap) && (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2); } } static void invlcache_handler(void) { #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ wbinvd(); } static void invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) { switch (smp_tlb_op) { case INVL_OP_TLB: invltlb_handler(smp_tlb_pmap); break; case INVL_OP_TLB_INVPCID: invltlb_invpcid_handler(smp_tlb_pmap); break; case INVL_OP_TLB_INVPCID_PTI: invltlb_invpcid_pti_handler(smp_tlb_pmap); break; case INVL_OP_TLB_PCID: invltlb_pcid_handler(smp_tlb_pmap); break; case INVL_OP_PGRNG: invlrng_handler(smp_tlb_addr1, smp_tlb_addr2); break; case INVL_OP_PGRNG_INVPCID: invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1, smp_tlb_addr2); break; case INVL_OP_PGRNG_PCID: invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1, smp_tlb_addr2); break; case INVL_OP_PG: invlpg_handler(smp_tlb_addr1); break; case INVL_OP_PG_INVPCID: invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1); break; case INVL_OP_PG_PCID: invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1); break; case INVL_OP_CACHE: invlcache_handler(); break; default: __assert_unreachable(); break; } } void invlop_handler(void) { struct pcpu *initiator_pc; pmap_t smp_tlb_pmap; vm_offset_t smp_tlb_addr1, smp_tlb_addr2; u_int initiator_cpu_id; enum invl_op_codes smp_tlb_op; uint32_t *scoreboard, smp_tlb_gen; scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid)); for (;;) { for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid; initiator_cpu_id++) { if (atomic_load_int(&scoreboard[initiator_cpu_id]) == 0) break; } if (initiator_cpu_id > mp_maxid) break; initiator_pc = cpuid_to_pcpu[initiator_cpu_id]; /* * This acquire fence and its corresponding release * fence in smp_targeted_tlb_shootdown() is between * reading zero scoreboard slot and accessing PCPU of * initiator for pc_smp_tlb values. */ atomic_thread_fence_acq(); smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap; smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1; smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2; smp_tlb_op = initiator_pc->pc_smp_tlb_op; smp_tlb_gen = initiator_pc->pc_smp_tlb_gen; /* * Ensure that we do not make our scoreboard * notification visible to the initiator until the * pc_smp_tlb values are read. The corresponding * fence is implicitly provided by the barrier in the * IPI send operation before the APIC ICR register * write. * * As an optimization, the request is acknowledged * before the actual invalidation is performed. It is * safe because target CPU cannot return to userspace * before handler finishes. Only NMI can preempt the * handler, but NMI would see the kernel handler frame * and not touch not-invalidated user page table. */ atomic_thread_fence_acq(); atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen); invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1, smp_tlb_addr2); } } diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 168c9eec1b66..8d6c81a5459b 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1,12001 +1,11987 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2020 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef NUMA #define PMAP_MEMDOM MAXMEMDOM #else #define PMAP_MEMDOM 1 #endif static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #undef pa_index #ifdef NUMA #define pa_index(pa) ({ \ KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ ("address %lx beyond the last segment", (pa))); \ (pa) >> PDRSHIFT; \ }) #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ struct rwlock *_lock; \ if (__predict_false((pa) > pmap_last_pa)) \ _lock = &pv_dummy_large.pv_lock; \ else \ _lock = &(pa_to_pmdp(pa)->pv_lock); \ _lock; \ }) #else #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #endif #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); int __read_frequently la57 = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &la57, 0, "5-level paging for host is enabled"); static bool pmap_is_la57(pmap_t pmap) { if (pmap->pm_type == PT_X86) return (la57); return (false); /* XXXKIB handle EPT */ } #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ u_int64_t KPML5phys; /* phys addr of kernel level 5, if supported */ #ifdef KASAN static uint64_t KASANPDPphys; #endif #ifdef KMSAN static uint64_t KMSANSHADPDPphys; static uint64_t KMSANORIGPDPphys; /* * To support systems with large amounts of memory, it is necessary to extend * the maximum size of the direct map. This could eat into the space reserved * for the shadow map. */ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); #endif static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ vm_paddr_t KERNend; /* and the end */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv list lock but reads are not. */ #ifdef NUMA static __inline int pc_to_domain(struct pv_chunk *pc) { return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); } #else static __inline int pc_to_domain(struct pv_chunk *pc __unused) { return (0); } #endif struct pv_chunks_list { struct mtx pvc_lock; TAILQ_HEAD(pch, pv_chunk) pvc_list; int active_reclaims; } __aligned(CACHE_LINE_SIZE); struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; #ifdef NUMA struct pmap_large_md_page { struct rwlock pv_lock; struct md_page pv_page; u_long pv_invl_gen; }; __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; #define pv_dummy pv_dummy_large.pv_page __read_mostly static struct pmap_large_md_page *pv_table; __read_mostly vm_paddr_t pmap_last_pa; #else static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; #endif /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, &pcid_save_cnt, "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1" : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1" : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } static COUNTER_U64_DEFINE_EARLY(pv_page_count); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, &pv_page_count, "Current number of allocated pv pages"); static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, &user_pt_page_count, "Current number of allocated page table pages for userspace"); static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, &kernel_pt_page_count, "Current number of allocated page table pages for the kernel"); #ifdef PV_STATS static COUNTER_U64_DEFINE_EARLY(invl_start_restart); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, "Number of delayed TLB invalidation request restarts"); static COUNTER_U64_DEFINE_EARLY(invl_finish_restart); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, "Number of delayed TLB invalidation completion restarts"); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, "Maximum delayed TLB invalidation request queue length"); #endif #define di_delay locks_delay static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = (uintptr_t)atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(counter_u64_add(invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(counter_u64_add(invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. */ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(counter_u64_add(invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = (uintptr_t)atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(counter_u64_add(invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(counter_u64_add(invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(counter_u64_add(invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = (uintptr_t)atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static COUNTER_U64_DEFINE_EARLY(invl_wait); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, "Number of times DI invalidation blocked pmap_remove_all/write"); static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); #endif #ifdef NUMA static u_long * pmap_delayed_invl_genp(vm_page_t m) { vm_paddr_t pa; u_long *gen; pa = VM_PAGE_TO_PHYS(m); if (__predict_false((pa) > pmap_last_pa)) gen = &pv_dummy_large.pv_invl_gen; else gen = &(pa_to_pmdp(pa)->pv_invl_gen); return (gen); } #else static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } #endif static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { counter_u64_add(invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(counter_u64_add(invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. */ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(counter_u64_add(invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and * pmap_change_props_locked(). */ #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. */ TAILQ_HEAD(pv_chunklist, pv_chunk); static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_chunk_batch(struct pv_chunklist *batch); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp); static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va); static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); static void pmap_free_pt_page(pmap_t, vm_page_t, bool); /********************/ /* Inline functions */ /********************/ /* * Return a non-clipped indexes for a given VA, which are page table * pages indexes at the corresponding level. */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } static __inline vm_pindex_t pmap_pdpe_pindex(vm_offset_t va) { return (NUPDE + (va >> PDPSHIFT)); } static __inline vm_pindex_t pmap_pml4e_pindex(vm_offset_t va) { return (NUPDE + NUPDPE + (va >> PML4SHIFT)); } static __inline vm_pindex_t pmap_pml5e_pindex(vm_offset_t va) { return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); } static __inline pml4_entry_t * pmap_pml5e(pmap_t pmap, vm_offset_t va) { MPASS(pmap_is_la57(pmap)); return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); } static __inline pml4_entry_t * pmap_pml5e_u(pmap_t pmap, vm_offset_t va) { MPASS(pmap_is_la57(pmap)); return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); } static __inline pml4_entry_t * pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) { pml4_entry_t *pml4e; /* XXX MPASS(pmap_is_la57(pmap); */ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); return (&pml4e[pmap_pml4e_index(va)]); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { pml5_entry_t *pml5e; pml4_entry_t *pml4e; pt_entry_t PG_V; if (pmap_is_la57(pmap)) { pml5e = pmap_pml5e(pmap, va); PG_V = pmap_valid_bit(pmap); if ((*pml5e & PG_V) == 0) return (NULL); pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); } else { pml4e = pmap->pm_pmltop; } return (&pml4e[pmap_pml4e_index(va)]); } static __inline pml4_entry_t * pmap_pml4e_u(pmap_t pmap, vm_offset_t va) { MPASS(!pmap_is_la57(pmap)); return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; KASSERT((*pdpe & PG_PS) == 0, ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); KASSERT((*pdpe & PG_PS) == 0, ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; KASSERT((*pde & PG_PS) == 0, ("%s: pde %#lx is a leaf", __func__, *pde)); pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_adj(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count + count >= 0, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count += count; } static __inline void pmap_pt_page_count_adj(pmap_t pmap, int count) { if (pmap == kernel_pmap) counter_u64_add(kernel_pt_page_count, count); else { if (pmap != NULL) pmap_resident_count_adj(pmap, count); counter_u64_add(user_pt_page_count, count); } } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); if (la57) { mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); } else { mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); } } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); if (la57) { mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); return (P5Dmap + ((va >> PDRSHIFT) & mask)); } else { mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (P4Dmap + ((va >> PDRSHIFT) & mask)); } } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * The kernel is loaded at a 2MB-aligned address, and memory below that * need not be executable. The .bss section is padded to a 2MB * boundary, so memory following the kernel need not be executable * either. Preloaded kernel modules have their mapping permissions * fixed up by the linker. */ if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || pa >= trunc_2mpage(kernphys + _end - KERNSTART)) return (X86_PG_RW | pg_nx); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. */ if (pa < round_2mpage(kernphys + etext - KERNSTART)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; vm_paddr_t pax; #ifdef KASAN pt_entry_t *pt_p; uint64_t KASANPDphys, KASANPTphys, KASANphys; vm_offset_t kasankernbase; int kasankpdpi, kasankpdi, nkasanpte; #endif int i, j, ndm1g, nkpdpe, nkdmpde; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the one or more 1G pages from ndm1g that maps * kernel memory into DMAP. */ nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + kernphys - rounddown2(kernphys, NBPDP), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages. */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); #ifdef KASAN KASANPDPphys = allocpages(firstaddr, NKASANPML4E); KASANPDphys = allocpages(firstaddr, 1); #endif #ifdef KMSAN /* * The KMSAN shadow maps are initially left unpopulated, since there is * no need to shadow memory above KERNBASE. */ KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); #endif /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); #ifdef KASAN nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); KASANPTphys = allocpages(firstaddr, nkasanpte); KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); #endif /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from start of the kernel in physical memory (staging * area) to the end of loader preallocated memory using 2MB * pages. This replaces some of the PD entries created above. * For compatibility, identity map 2M at the start. */ pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | X86_PG_RW | pg_nx; for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(pax); } /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; #ifdef KASAN kasankernbase = kasan_md_addr_to_shad(KERNBASE); kasankpdpi = pmap_pdpe_index(kasankernbase); kasankpdi = pmap_pde_index(kasankernbase); pdp_p = (pdp_entry_t *)KASANPDPphys; pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); pd_p = (pd_entry_t *)KASANPDphys; for (i = 0; i < nkasanpte; i++) pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; pt_p = (pt_entry_t *)KASANPTphys; for (i = 0; i < nkasanpte * NPTEPG; i++) pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | pg_nx; #endif /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) */ if (ndm1g > 0) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0, pax = rounddown2(kernphys, NBPDP); i < NPDEPG * nkdmpde; i++, pax += NBPDR) { pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(pax); } j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; for (i = 0; i < nkdmpde; i++) { pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; } } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; #ifdef KASAN /* Connect the KASAN shadow map slots up to the PML4. */ for (i = 0; i < NKASANPML4E; i++) { p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } #endif #ifdef KMSAN /* Connect the KMSAN shadow map slots up to the PML4. */ for (i = 0; i < NKMSANSHADPML4E; i++) { p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* Connect the KMSAN origin map slots up to the PML4. */ for (i = 0; i < NKMSANORIGPML4E; i++) { p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } #endif /* Connect the Direct Map slots up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *pcpu_pte; struct region_descriptor r_gdt; uint64_t cr4, pcpu_phys; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); pcpu_phys = allocpages(firstaddr, MAXCPU); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - (vm_paddr_t)kernphys); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; for (i = 0; i < MAXCPU; i++) { pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | pg_g | pg_nx | X86_PG_M | X86_PG_A; } /* * Re-initialize PCPU area for BSP after switching. * Make hardware use gdt and common_tss from the new PCPU. */ STAILQ_INIT(&cpuhead); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); amd64_bsp_pcpu_init1(&__pcpu[0]); amd64_bsp_ist_init(&__pcpu[0]); __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * sizeof(struct user_segment_descriptor)); gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; lgdt(&r_gdt); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); ltr(GSEL(GPROC0_SEL, SEL_KPL)); __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } vm_page_t pmap_page_alloc_below_4g(bool zeroed) { vm_page_t m; m = vm_page_alloc_contig(NULL, 0, (zeroed ? VM_ALLOC_ZERO : 0) | VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (m != NULL && zeroed && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } extern const char la57_trampoline[], la57_trampoline_gdt_desc[], la57_trampoline_gdt[], la57_trampoline_end[]; static void pmap_bootstrap_la57(void *arg __unused) { char *v_code; pml5_entry_t *v_pml5; pml4_entry_t *v_pml4; pdp_entry_t *v_pdp; pd_entry_t *v_pd; pt_entry_t *v_pt; vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; void (*la57_tramp)(uint64_t pml5); struct region_descriptor r_gdt; if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) return; TUNABLE_INT_FETCH("vm.pmap.la57", &la57); if (!la57) return; r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; m_code = pmap_page_alloc_below_4g(true); v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); m_pml5 = pmap_page_alloc_below_4g(true); KPML5phys = VM_PAGE_TO_PHYS(m_pml5); v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); m_pml4 = pmap_page_alloc_below_4g(true); v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); m_pdp = pmap_page_alloc_below_4g(true); v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); m_pd = pmap_page_alloc_below_4g(true); v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); m_pt = pmap_page_alloc_below_4g(true); v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); /* * Map m_code 1:1, it appears below 4G in KVA due to physical * address being below 4G. Since kernel KVA is in upper half, * the pml4e should be zero and free for temporary use. */ kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* * Add pml5 entry at top of KVA pointing to existing pml4 table, * entering all existing kernel mappings into level 5 table. */ v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; /* * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. */ v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* * Copy and call the 48->57 trampoline, hope we return there, alive. */ bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); invlpg((vm_offset_t)la57_tramp); la57_tramp(KPML5phys); /* * gdt was necessary reset, switch back to our gdt. */ lgdt(&r_gdt); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); ltr(GSEL(GPROC0_SEL, SEL_KPL)); /* * Now unmap the trampoline, and free the pages. * Clear pml5 entry used for 1:1 trampoline mapping. */ pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); invlpg((vm_offset_t)v_code); vm_page_free(m_code); vm_page_free(m_pdp); vm_page_free(m_pd); vm_page_free(m_pt); /* * Recursively map PML5 to itself in order to get PTmap and * PDmap. */ v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; kernel_pmap->pm_cr3 = KPML5phys; kernel_pmap->pm_pmltop = v_pml5; pmap_pt_page_count_adj(kernel_pmap, 1); } SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } static int pmap_allow_2m_x_ept; SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &pmap_allow_2m_x_ept, 0, "Allow executable superpage mappings in EPT"); void pmap_allow_2m_x_ept_recalculate(void) { /* * SKL002, SKL012S. Since the EPT format is only used by * Intel CPUs, the vendor check is merely a formality. */ if (!(cpu_vendor_id != CPU_VENDOR_INTEL || (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ CPUID_TO_MODEL(cpu_id) == 0x27 || CPUID_TO_MODEL(cpu_id) == 0x35 || CPUID_TO_MODEL(cpu_id) == 0x36 || CPUID_TO_MODEL(cpu_id) == 0x37 || CPUID_TO_MODEL(cpu_id) == 0x86 || CPUID_TO_MODEL(cpu_id) == 0x1c || CPUID_TO_MODEL(cpu_id) == 0x4a || CPUID_TO_MODEL(cpu_id) == 0x4c || CPUID_TO_MODEL(cpu_id) == 0x4d || CPUID_TO_MODEL(cpu_id) == 0x5a || CPUID_TO_MODEL(cpu_id) == 0x5c || CPUID_TO_MODEL(cpu_id) == 0x5d || CPUID_TO_MODEL(cpu_id) == 0x5f || CPUID_TO_MODEL(cpu_id) == 0x6e || CPUID_TO_MODEL(cpu_id) == 0x7a || CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ CPUID_TO_MODEL(cpu_id) == 0x85)))) pmap_allow_2m_x_ept = 1; TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); } static bool pmap_allow_2m_x_page(pmap_t pmap, bool executable) { return (pmap->pm_type != PT_EPT || !executable || !pmap_allow_2m_x_ept); } #ifdef NUMA static void pmap_init_pv_table(void) { struct pmap_large_md_page *pvd; vm_size_t s; long start, end, highest, pv_npg; int domain, i, j, pages; /* * We strongly depend on the size being a power of two, so the assert * is overzealous. However, should the struct be resized to a * different power of two, the code below needs to be revisited. */ CTASSERT((sizeof(*pvd) == 64)); /* * Calculate the size of the array. */ pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; pv_npg = howmany(pmap_last_pa, NBPDR); s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); s = round_page(s); pv_table = (struct pmap_large_md_page *)kva_alloc(s); if (pv_table == NULL) panic("%s: kva_alloc failed\n", __func__); /* * Iterate physical segments to allocate space for respective pages. */ highest = -1; s = 0; for (i = 0; i < vm_phys_nsegs; i++) { end = vm_phys_segs[i].end / NBPDR; domain = vm_phys_segs[i].domain; if (highest >= end) continue; start = highest + 1; pvd = &pv_table[start]; pages = end - start + 1; s = round_page(pages * sizeof(*pvd)); highest = start + (s / sizeof(*pvd)) - 1; for (j = 0; j < s; j += PAGE_SIZE) { - vm_page_t m = vm_page_alloc_domain(NULL, 0, - domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); + vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); if (m == NULL) - panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j); + panic("failed to allocate PV table page"); pmap_qenter((vm_offset_t)pvd + j, &m, 1); } for (j = 0; j < s / sizeof(*pvd); j++) { rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; pvd++; } } pvd = &pv_dummy_large; rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; } #else static void pmap_init_pv_table(void) { vm_size_t s; long i, pv_npg; /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)pv_npg * sizeof(struct md_page); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; int error, i, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* IFU */ pmap_allow_2m_x_ept_recalculate(); /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->ref_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if ((i == 0 || kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; if ((amd_feature & AMDID_PAGE1GB) != 0) { KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, ("pmap_init: can't assign to pagesizes[2]")); pagesizes[2] = NBPDP; } } /* * Initialize pv chunk lists. */ for (i = 0; i < PMAP_MEMDOM; i++) { mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); TAILQ_INIT(&pv_chunks[i].pvc_list); } pmap_init_pv_table(); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; #ifdef KMSAN if (lm_ents > KMSANORIGPML4I - LMSPML4I) { printf( "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", lm_ents, KMSANORIGPML4I - LMSPML4I); lm_ents = KMSANORIGPML4I - LMSPML4I; } #endif if (bootverbose) printf("pmap: large map %u PML4 slots (%lu GB)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); /* XXXKIB la57 */ kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, "Maximum number of PML4 entries for use by large map (tunable). " "Each entry corresponds to 512GB of address space."); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, "2MB page mappings"); static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, "2MB page promotion failures"); static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "1GB page mapping counters"); static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; pat_idx = 0; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; break; case PT_EPT: if ((pte & EPT_PG_IGNORE_PAT) != 0) panic("EPT PTE %#lx has no PAT memory type", pte); pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; break; } /* See pmap_init_pat(). */ if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; return (pat_idx); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } /* * The amd64 pmap uses different approaches to TLB invalidation * depending on the kernel configuration, available hardware features, * and known hardware errata. The kernel configuration option that * has the greatest operational impact on TLB invalidation is PTI, * which is enabled automatically on affected Intel CPUs. The most * impactful hardware features are first PCID, and then INVPCID * instruction presence. PCID usage is quite different for PTI * vs. non-PTI. * * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate * the Meltdown bug in some Intel CPUs. Under PTI, each user address * space is served by two page tables, user and kernel. The user * page table only maps user space and a kernel trampoline. The * kernel trampoline includes the entirety of the kernel text but * only the kernel data that is needed to switch from user to kernel * mode. The kernel page table maps the user and kernel address * spaces in their entirety. It is identical to the per-process * page table used in non-PTI mode. * * User page tables are only used when the CPU is in user mode. * Consequently, some TLB invalidations can be postponed until the * switch from kernel to user mode. In contrast, the user * space part of the kernel page table is used for copyout(9), so * TLB invalidations on this page table cannot be similarly postponed. * * The existence of a user mode page table for the given pmap is * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in * which case pm_ucr3 contains the %cr3 register value for the user * mode page table's root. * * * The pm_active bitmask indicates which CPUs currently have the * pmap active. A CPU's bit is set on context switch to the pmap, and * cleared on switching off this CPU. For the kernel page table, * the pm_active field is immutable and contains all CPUs. The * kernel page table is always logically active on every processor, * but not necessarily in use by the hardware, e.g., in PTI mode. * * When requesting invalidation of virtual addresses with * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to * all CPUs recorded as active in pm_active. Updates to and reads * from pm_active are not synchronized, and so they may race with * each other. Shootdown handlers are prepared to handle the race. * * * PCID is an optional feature of the long mode x86 MMU where TLB * entries are tagged with the 'Process ID' of the address space * they belong to. This feature provides a limited namespace for * process identifiers, 12 bits, supporting 4095 simultaneous IDs * total. * * Allocation of a PCID to a pmap is done by an algorithm described * in section 15.12, "Other TLB Consistency Algorithms", of * Vahalia's book "Unix Internals". A PCID cannot be allocated for * the whole lifetime of a pmap in pmap_pinit() due to the limited * namespace. Instead, a per-CPU, per-pmap PCID is assigned when * the CPU is about to start caching TLB entries from a pmap, * i.e., on the context switch that activates the pmap on the CPU. * * The PCID allocator maintains a per-CPU, per-pmap generation * count, pm_gen, which is incremented each time a new PCID is * allocated. On TLB invalidation, the generation counters for the * pmap are zeroed, which signals the context switch code that the * previously allocated PCID is no longer valid. Effectively, * zeroing any of these counters triggers a TLB shootdown for the * given CPU/address space, due to the allocation of a new PCID. * * Zeroing can be performed remotely. Consequently, if a pmap is * inactive on a CPU, then a TLB shootdown for that pmap and CPU can * be initiated by an ordinary memory access to reset the target * CPU's generation count within the pmap. The CPU initiating the * TLB shootdown does not need to send an IPI to the target CPU. * * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs * for complete (kernel) page tables, and PCIDs for user mode page * tables. A user PCID value is obtained from the kernel PCID value * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). * * User space page tables are activated on return to user mode, by * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests * clearing bit 63 of the loaded ucr3, this effectively causes * complete invalidation of the user mode TLB entries for the * current pmap. In which case, local invalidations of individual * pages in the user page table are skipped. * * * Local invalidation, all modes. If the requested invalidation is * for a specific address or the total invalidation of a currently * active pmap, then the TLB is flushed using INVLPG for a kernel * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a * user space page table(s). * * If the INVPCID instruction is available, it is used to flush entries * from the kernel page table. * * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its * address space, all other 4095 PCIDs are used for user mode spaces * as described above. A context switch allocates a new PCID if * the recorded PCID is zero or the recorded generation does not match * the CPU's generation, effectively flushing the TLB for this address space. * Total remote invalidation is performed by zeroing pm_gen for all CPUs. * local user page: INVLPG * local kernel page: INVLPG * local user total: INVPCID(CTX) * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() * remote user page, inactive pmap: zero pm_gen * remote user page, active pmap: zero pm_gen + IPI:INVLPG * (Both actions are required to handle the aforementioned pm_active races.) * remote kernel page: IPI:INVLPG * remote user total, inactive pmap: zero pm_gen * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or * reload %cr3) * (See note above about pm_active races.) * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) * * PTI enabled, PCID present. * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) * for upt * local kernel page: INVLPG * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE * on loading UCR3 into %cr3 for upt * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() * remote user page, inactive pmap: zero pm_gen * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, * INVPCID(ADDR) for upt) * remote kernel page: IPI:INVLPG * remote user total, inactive pmap: zero pm_gen * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, * clear PCID_SAVE on loading UCR3 into $cr3 for upt) * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) * * No PCID. * local user page: INVLPG * local kernel page: INVLPG * local user total: reload %cr3 * local kernel total: invltlb_glob() * remote user page, inactive pmap: - * remote user page, active pmap: IPI:INVLPG * remote kernel page: IPI:INVLPG * remote user total, inactive pmap: - * remote user total, active pmap: IPI:(reload %cr3) * remote kernel total: IPI:invltlb_glob() * Since on return to user mode, the reload of %cr3 with ucr3 causes * TLB invalidation, no specific action is required for user page table. * * EPT. EPT pmaps do not map KVA, all mappings are userspace. * XXX TODO */ #ifdef SMP /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { smr_seq_t goal; int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. * * To ensure that all vCPU threads have observed the new counter * value before returning, we use SMR. Ordering is important here: * the VMM enters an SMR read section before loading the counter * and after updating the pm_active bit set. Thus, pm_active is * a superset of active readers, and any reader that has observed * the goal has observed the new counter value. */ atomic_add_long(&pmap->pm_eptgen, 1); goal = smr_advance(pmap->pm_eptsmr); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); /* * Ensure that all active vCPUs will observe the new generation counter * value before executing any more guest instructions. */ smr_wait(pmap->pm_eptsmr, goal); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_preipi_pcid(pmap_t pmap) { u_int cpuid, i; sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap != PCPU_GET(curpmap)) cpuid = 0xffffffff; /* An impossible value */ CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) { sched_pin(); } DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) { return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid : pmap_invalidate_preipi_nopcid); } static inline void pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid; /* * Because pm_pcid is recalculated on a context switch, we * must ensure there is no preemption, not just pinning. * Otherwise, we might use a stale value below. */ CRITICAL_ASSERT(curthread); /* * No need to do anything with user page tables invalidation * if there is no user page table, or invalidation is deferred * until the return to userspace. ucr3_load_mask is stable * because we have preemption disabled. */ if (pmap->pm_ucr3 == PMAP_NO_CR3 || PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) return; cpuid = PCPU_GET(cpuid); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } } static void pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid_cb(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid_cb(pmap, va, false); } static void pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb : pmap_invalidate_page_pcid_noinvpcid_cb); return (pmap_invalidate_page_nopcid_cb); } static void pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, vm_offset_t addr2 __unused) { if (pmap == kernel_pmap) { invlpg(va); } else if (pmap == PCPU_GET(curpmap)) { invlpg(va); pmap_invalidate_page_cb(pmap, va); } } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); pmap_invalidate_preipi(pmap); smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap, pmap_invalidate_page_curcpu_cb); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid; CRITICAL_ASSERT(curthread); if (pmap != PCPU_GET(curpmap) || pmap->pm_ucr3 == PMAP_NO_CR3 || PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) return; cpuid = PCPU_GET(cpuid); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } } static void pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, vm_offset_t eva __unused) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb : pmap_invalidate_range_pcid_noinvpcid_cb); return (pmap_invalidate_range_nopcid_cb); } static void pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); pmap_invalidate_range_cb(pmap, sva, eva); } } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); pmap_invalidate_preipi(pmap); smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap, pmap_invalidate_range_curcpu_cb); } static inline void pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3; uint32_t pcid; u_int cpuid; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { CRITICAL_ASSERT(curthread); cpuid = PCPU_GET(cpuid); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { kcr3 = pmap->pm_cr3 | pcid; load_cr3(kcr3); } if (pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); } } static void pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) { pmap_invalidate_all_pcid_cb(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) { pmap_invalidate_all_pcid_cb(pmap, false); } static void pmap_invalidate_all_nopcid_cb(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb : pmap_invalidate_all_pcid_noinvpcid_cb); return (pmap_invalidate_all_nopcid_cb); } static void pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) { pmap_invalidate_all_cb(pmap); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); pmap_invalidate_preipi(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap, pmap_invalidate_all_curcpu_cb); } static void pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, vm_offset_t addr2 __unused) { wbinvd(); } void pmap_invalidate_cache(void) { sched_pin(); smp_cache_flush(pmap_invalidate_cache_curcpu_cb); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. Use a locked * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ atomic_thread_fence_seq_cst(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); atomic_thread_fence_seq_cst(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) atomic_thread_fence_seq_cst(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) atomic_thread_fence_seq_cst(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; atomic_thread_fence_seq_cst(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); atomic_thread_fence_seq_cst(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB atomic inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pdp_entry_t pdpe, *pdpep; pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_page_t m; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpep = pmap_pdpe(pmap, va); if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) goto out; if ((pdpe & PG_PS) != 0) { if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) goto out; m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); goto check_page; } pdep = pmap_pdpe_to_pde(pdpep, va); if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) goto out; if ((pde & PG_PS) != 0) { if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) goto out; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); goto check_page; } pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) == 0 || ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) goto out; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); check_page: if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; out: PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { /* PML4 page */ MPASS(pmap_is_la57(pmap)); pml5 = pmap_pml5e(pmap, va); *pml5 = 0; if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { pml5 = pmap_pml5e_u(pmap, va); *pml5 = 0; } } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd = pmap_pde(pmap, va); *pd = 0; } if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { /* We just released a PDP, unhold the matching PML4 */ pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pml4pg, free); } pmap_pt_page_count_adj(pmap, -1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } /* * Release a page table page reference after a failed attempt to create a * mapping. */ static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { struct spglist free; SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" was never mapped, paging-structure caches * could nonetheless have entries that refer to the freed * page table pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pmltop = kernel_pmap->pm_pmltop; pmap->pm_pmltopu = NULL; pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } #ifdef KASAN for (i = 0; i < NKASANPML4E; i++) { pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; } #endif #ifdef KMSAN for (i = 0; i < NKMSANSHADPML4E; i++) { pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; } for (i = 0; i < NKMSANORIGPML4E; i++) { pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; } #endif for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; } void pmap_pinit_pml5(vm_page_t pml5pg) { pml5_entry_t *pm_pml5; pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); /* * Add pml5 entry at top of KVA pointing to existing pml4 table, * entering all existing kernel mappings into level 5 table. */ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); /* * Install self-referential address mapping entry. */ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } static void pmap_pinit_pml4_pti(vm_page_t pml4pgu) { pml4_entry_t *pm_pml4u; int i; pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) pm_pml4u[i] = pti_pml4[i]; } static void pmap_pinit_pml5_pti(vm_page_t pml5pgu) { pml5_entry_t *pm_pml5u; pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); pagezero(pm_pml5u); /* * Add pml5 entry at top of KVA pointing to existing pml4 pti * table, entering all kernel mappings needed for usermode * into level 5 table. */ pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = pmap_kextract((vm_offset_t)pti_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } /* Allocate a page table page and do related bookkeeping */ static vm_page_t pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) { vm_page_t m; - m = vm_page_alloc(NULL, pindex, flags | VM_ALLOC_NOOBJ); + m = vm_page_alloc_noobj(flags); if (__predict_false(m == NULL)) return (NULL); - + m->pindex = pindex; pmap_pt_page_count_adj(pmap, 1); - - if ((flags & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) - pmap_zero_page(m); - return (m); } static void pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) { /* * This function assumes the page will need to be unwired, * even though the counterpart allocation in pmap_alloc_pt_page() * doesn't enforce VM_ALLOC_WIRED. However, all current uses * of pmap_free_pt_page() require unwiring. The case in which * a PT page doesn't require unwiring because its ref_count has * naturally reached 0 is handled through _pmap_unwire_ptp(). */ vm_page_unwire_noq(m); if (zerofilled) vm_page_free_zero(m); else vm_page_free(m); pmap_pt_page_count_adj(pmap, -1); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pmltop_pg, pmltop_pgu; vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ - pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); + pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | + VM_ALLOC_WAITOK); pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ switch (pm_type) { case PT_X86: pmap->pm_cr3 = pmltop_phys; if (pmap_is_la57(pmap)) pmap_pinit_pml5(pmltop_pg); else pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pmltop_pgu = pmap_alloc_pt_page(NULL, 0, - VM_ALLOC_WIRED | VM_ALLOC_NORMAL | - VM_ALLOC_WAITOK); + VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pmltop_pgu)); if (pmap_is_la57(pmap)) pmap_pinit_pml5_pti(pmltop_pgu); else pmap_pinit_pml4_pti(pmltop_pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } break; case PT_EPT: case PT_RVI: pmap->pm_eptsmr = smr_create("pmap", 0, 0); break; } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } static void pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) { vm_page_t mpg; struct spglist free; mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if (mpg->ref_count != 0) return; SLIST_INIT(&free); _pmap_unwire_ptp(pmap, va, mpg, &free); pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } static pml4_entry_t * pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, bool addref) { vm_pindex_t pml5index; pml5_entry_t *pml5; pml4_entry_t *pml4; vm_page_t pml4pg; pt_entry_t PG_V; bool allocated; if (!pmap_is_la57(pmap)) return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); PG_V = pmap_valid_bit(pmap); pml5index = pmap_pml5e_index(va); pml5 = &pmap->pm_pmltop[pml5index]; if ((*pml5 & PG_V) == 0) { if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, va) == NULL) return (NULL); allocated = true; } else { allocated = false; } pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); pml4 = &pml4[pmap_pml4e_index(va)]; if ((*pml4 & PG_V) == 0) { pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); if (allocated && !addref) pml4pg->ref_count--; else if (!allocated && addref) pml4pg->ref_count++; } return (pml4); } static pdp_entry_t * pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, bool addref) { vm_page_t pdppg; pml4_entry_t *pml4; pdp_entry_t *pdp; pt_entry_t PG_V; bool allocated; PG_V = pmap_valid_bit(pmap); pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); if (pml4 == NULL) return (NULL); if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, va) == NULL) { if (pmap_is_la57(pmap)) pmap_allocpte_free_unref(pmap, va, pmap_pml5e(pmap, va)); return (NULL); } allocated = true; } else { allocated = false; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pmap_pdpe_index(va)]; if ((*pdp & PG_V) == 0) { pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); if (allocated && !addref) pdppg->ref_count--; else if (!allocated && addref) pdppg->ref_count++; } return (pdp); } /* * The ptepindexes, i.e. page indices, of the page table pages encountered * while translating virtual address va are defined as follows: * - for the page table page (last level), * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, * in other words, it is just the index of the PDE that maps the page * table page. * - for the page directory page, * ptepindex = NUPDE (number of userland PD entries) + * (pmap_pde_index(va) >> NPDEPGSHIFT) * i.e. index of PDPE is put after the last index of PDE, * - for the page directory pointer page, * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + * NPML4EPGSHIFT), * i.e. index of pml4e is put after the last index of PDPE, * - for the PML4 page (if LA57 mode is enabled), * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), * i.e. index of pml5e is put after the last index of PML4E. * * Define an order on the paging entries, where all entries of the * same height are put together, then heights are put from deepest to * root. Then ptexpindex is the sequential number of the * corresponding paging entry in this order. * * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of * LA57 paging structures even in LA48 paging mode. Moreover, the * ptepindexes are calculated as if the paging structures were 5-level * regardless of the actual mode of operation. * * The root page at PML4/PML5 does not participate in this indexing scheme, * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). */ static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va) { vm_pindex_t pml5index, pml4index; pml5_entry_t *pml5, *pml5u; pml4_entry_t *pml4, *pml4u; pdp_entry_t *pdp; pd_entry_t *pd; vm_page_t m, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ m = pmap_alloc_pt_page(pmap, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { MPASS(pmap_is_la57(pmap)); pml5index = pmap_pml5e_index(va); pml5 = &pmap->pm_pmltop[pml5index]; KASSERT((*pml5 & PG_V) == 0, ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml5 |= pg_nx; pml5u = &pmap->pm_pmltopu[pml5index]; *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE + NUPDPE) { pml4index = pmap_pml4e_index(va); /* Wire up a new PDPE page */ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); if (pml4 == NULL) { pmap_free_pt_page(pmap, m, true); return (NULL); } KASSERT((*pml4 & PG_V) == 0, ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pmltopu[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { /* Wire up a new PDE page */ pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); if (pdp == NULL) { pmap_free_pt_page(pmap, m, true); return (NULL); } KASSERT((*pdp & PG_V) == 0, ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { /* Wire up a new PTE page */ pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); if (pdp == NULL) { pmap_free_pt_page(pmap, m, true); return (NULL); } if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), lockp, va) == NULL) { pmap_allocpte_free_unref(pmap, va, pmap_pml4e(pmap, va)); pmap_free_pt_page(pmap, m, true); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->ref_count++; } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[pmap_pde_index(va)]; KASSERT((*pd & PG_V) == 0, ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } return (m); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. Sleep * occurs right before returning to the caller. This way, we never * drop pmap lock to sleep while a page table page has ref_count == 0, * which prevents the page from being freed under us. */ static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va) { vm_page_t m; m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } return (m); } static pd_entry_t * pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp) { pdp_entry_t *pdpe, PG_V; pd_entry_t *pde; vm_page_t pdpg; vm_pindex_t pdpindex; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { pde = pmap_pdpe_to_pde(pdpe, va); if (va < VM_MAXUSER_ADDRESS) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->ref_count++; } else pdpg = NULL; } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. */ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); if (pdpg == NULL) { if (lockp != NULL) goto retry; else return (NULL); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; } else panic("pmap_alloc_pde: missing page table page for va %#lx", va); *pdpgp = pdpg; return (pde); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap %p resident count %ld != 0", pmap, pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap %p has reserved page table page(s)", pmap)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); if (pmap_is_la57(pmap)) { pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; pmap->pm_pmltop[PML5PML5I] = 0; } else { for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pmltop[KPML4BASE + i] = 0; #ifdef KASAN for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ pmap->pm_pmltop[KASANPML4I + i] = 0; #endif #ifdef KMSAN for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; #endif for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pmltop[DMPML4I + i] = 0; pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pmltop[LMSPML4I + i] = 0; } pmap_free_pt_page(NULL, m, true); if (pmap->pm_pmltopu != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> pm_pmltopu)); pmap_free_pt_page(NULL, m, false); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); #ifdef KMSAN static void pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t dummypa, dummypd, dummypt; int i, npde, npdpg; npdpg = howmany(size, NBPDP); npde = size / NBPDR; dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); pagezero((void *)PHYS_TO_DMAP(dummypa)); dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); pagezero((void *)PHYS_TO_DMAP(dummypt)); dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); for (i = 0; i < npdpg; i++) pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); for (i = 0; i < NPTEPG; i++) pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx); pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); for (i = 0; i < npde; i++) pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); pdpe = (pdp_entry_t *)PHYS_TO_DMAP(pdppa); for (i = 0; i < npdpg; i++) pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | X86_PG_RW | pg_nx); } static void pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) { vm_size_t size; KASSERT(start % NBPDP == 0, ("unaligned page array start address")); /* * The end of the page array's KVA region is 2MB aligned, see * kmem_init(). */ size = round_2mpage(end) - start; pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); } #endif /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. */ void pmap_page_array_startup(long pages) { pdp_entry_t *pdpe; pd_entry_t *pde, newpdir; vm_offset_t va, start, end; vm_paddr_t pa; long pfn; int domain, i; vm_page_array_size = pages; start = VM_MIN_KERNEL_ADDRESS; end = start + pages * sizeof(struct vm_page); for (va = start; va < end; va += NBPDR) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = vm_phys_domain(ptoa(pfn)); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero((void *)PHYS_TO_DMAP(pa)); *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) != 0) panic("Unexpected pde"); pa = vm_phys_early_alloc(domain, NBPDR); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS | pg_g | pg_nx); pde_store(pde, newpdir); } vm_page_array = (vm_page_t)start; #ifdef KMSAN pmap_kmsan_page_array_startup(start, end); #endif } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); if (kernel_vm_end < addr) kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); if (kernel_vm_end < addr) kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = pmap_alloc_pt_page(kernel_pmap, kernel_vm_end >> PDPSHIFT, VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, "Current number of pv entry cnunks"); static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, "Total number of pv entry chunks allocated"); static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, "Total number of pv entry chunks freed"); static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, "Number of failed attempts to get a pv entry chunk page"); static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, "Total number of pv entries freed"); static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, "Total number of pv entries allocated"); static COUNTER_U64_DEFINE_EARLY(pv_entry_count); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, "Current number of pv entries"); static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) { struct pv_chunks_list *pvc; struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di, restart; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). */ start_di = pmap_not_in_di(); pvc = &pv_chunks[domain]; mtx_lock(&pvc->pvc_lock); pvc->active_reclaims++; TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pvc->pvc_lock); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { restart = false; reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pvc->pvc_lock); restart = true; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pvc->pvc_lock); restart = true; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pvc->pvc_lock); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); if (restart) continue; } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pvc->pvc_lock); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_adj(pmap, -freed); PV_STAT(counter_u64_add(pv_entry_frees, freed)); PV_STAT(counter_u64_add(pv_entry_spare, freed)); PV_STAT(counter_u64_add(pv_entry_count, -freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); PV_STAT(counter_u64_add(pc_chunk_count, -1)); PV_STAT(counter_u64_add(pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pvc->pvc_lock); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pvc->pvc_lock); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); if (pvc->active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); } } } TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); pvc->active_reclaims--; mtx_unlock(&pvc->pvc_lock); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { vm_page_t m; int i, domain; domain = PCPU_GET(domain); for (i = 0; i < vm_ndomains; i++) { m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); if (m != NULL) break; domain = (domain + 1) % vm_ndomains; } return (m); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(counter_u64_add(pv_entry_frees, 1)); PV_STAT(counter_u64_add(pv_entry_spare, 1)); PV_STAT(counter_u64_add(pv_entry_count, -1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk_dequeued(struct pv_chunk *pc) { vm_page_t m; PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); PV_STAT(counter_u64_add(pc_chunk_count, -1)); PV_STAT(counter_u64_add(pc_chunk_frees, 1)); counter_u64_add(pv_page_count, -1); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } static void free_pv_chunk(struct pv_chunk *pc) { struct pv_chunks_list *pvc; pvc = &pv_chunks[pc_to_domain(pc)]; mtx_lock(&pvc->pvc_lock); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); mtx_unlock(&pvc->pvc_lock); free_pv_chunk_dequeued(pc); } static void free_pv_chunk_batch(struct pv_chunklist *batch) { struct pv_chunks_list *pvc; struct pv_chunk *pc, *npc; int i; for (i = 0; i < vm_ndomains; i++) { if (TAILQ_EMPTY(&batch[i])) continue; pvc = &pv_chunks[i]; mtx_lock(&pvc->pvc_lock); TAILQ_FOREACH(pc, &batch[i], pc_list) { TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); } mtx_unlock(&pvc->pvc_lock); } for (i = 0; i < vm_ndomains; i++) { TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { free_pv_chunk_dequeued(pc); } } } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { struct pv_chunks_list *pvc; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(counter_u64_add(pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(counter_u64_add(pv_entry_count, 1)); PV_STAT(counter_u64_add(pv_entry_spare, -1)); return (pv); } } /* No free items, allocate another chunk */ - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } else counter_u64_add(pv_page_count, 1); PV_STAT(counter_u64_add(pc_chunk_count, 1)); PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; pvc = &pv_chunks[vm_page_domain(m)]; mtx_lock(&pvc->pvc_lock); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); mtx_unlock(&pvc->pvc_lock); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(counter_u64_add(pv_entry_count, 1)); PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pv_chunks_list *pvc; struct pch new_tail[PMAP_MEMDOM]; struct pv_chunk *pc; vm_page_t m; int avail, free, i; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ for (i = 0; i < PMAP_MEMDOM; i++) TAILQ_INIT(&new_tail[i]); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } else counter_u64_add(pv_page_count, 1); PV_STAT(counter_u64_add(pc_chunk_count, 1)); PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } for (i = 0; i < vm_ndomains; i++) { if (TAILQ_EMPTY(&new_tail[i])) continue; pvc = &pv_chunks[i]; mtx_lock(&pvc->pvc_lock); TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); mtx_unlock(&pvc->pvc_lock); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), - (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | - VM_ALLOC_WIRED); + (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) mpte->ref_count = NPTEPG; } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); counter_u64_add(pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_adj(pmap, -1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pde: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_adj(pmap, -1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_page_t mt; vm_offset_t va_next; pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; if (pmap_is_la57(pmap)) { pml5e = pmap_pml5e(pmap, sva); if ((*pml5e & PG_V) == 0) { va_next = (sva + NBPML5) & ~PML5MASK; if (va_next < sva) va_next = eva; continue; } pml4e = pmap_pml5e_to_pml4e(pml5e, sva); } else { pml4e = pmap_pml4e(pmap, sva); } if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); MPASS(pmap != kernel_pmap); /* XXXKIB */ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); anyvalid = 1; *pdpe = 0; pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); pmap_unwire_ptp(pmap, sva, mt, &free); continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(lock); retry: while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_adj(pmap, -1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_page_t m; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; pt_entry_t obits, pbits; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. */ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); retry_pdpe: obits = pbits = *pdpe; MPASS((pbits & (PG_MANAGED | PG_G)) == 0); MPASS(pmap != kernel_pmap); /* XXXKIB */ if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pdpe, obits, pbits)) /* PG_PS cannot be cleared under us, */ goto retry_pdpe; anychanged = TRUE; } continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 static bool pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) { if (pmap->pm_type != PT_EPT) return (false); return ((pde & EPT_PG_EXECUTE) != 0); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } setpde: if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } setpte: if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong " "mpte %p pidx %#lx va %#lx va pde pidx %#lx", mpte, mpte->pindex, va, pmap_pde_pindex(va))); if (pmap_insert_pt_page(pmap, mpte, true)) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); counter_u64_add(pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ static int pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, int psind) { vm_page_t mp; pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, ("psind %d unexpected", psind)); KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, ("unaligned phys address %#lx newpte %#lx psind %d", newpte & PG_FRAME, newpte, psind)); KASSERT((va & (pagesizes[psind] - 1)) == 0, ("unaligned va %#lx psind %d", va, psind)); KASSERT(va < VM_MAXUSER_ADDRESS, ("kernel mode non-transparent superpage")); /* XXXKIB */ KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ PG_V = pmap_valid_bit(pmap); restart: if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) return (KERN_PROTECTION_FAILURE); pten = newpte; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) pten |= pmap_pkru_get(pmap, va); if (psind == 2) { /* 1G */ pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) { mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), NULL, va); if (mp == NULL) goto allocf; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); pdpe = &pdpe[pmap_pdpe_index(va)]; origpte = *pdpe; MPASS(origpte == 0); } else { pdpe = pmap_pml4e_to_pdpe(pml4e, va); KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); origpte = *pdpe; if ((origpte & PG_V) == 0) { mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); mp->ref_count++; } } *pdpe = pten; } else /* (psind == 1) */ { /* 2M */ pde = pmap_pde(pmap, va); if (pde == NULL) { mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), NULL, va); if (mp == NULL) goto allocf; pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); pde = &pde[pmap_pde_index(va)]; origpte = *pde; MPASS(origpte == 0); } else { origpte = *pde; if ((origpte & PG_V) == 0) { pdpe = pmap_pdpe(pmap, va); MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); mp->ref_count++; } } *pde = pten; } KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), ("va %#lx changing %s phys page origpte %#lx pten %#lx", va, psind == 2 ? "1G" : "2M", origpte, pten)); if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; if ((origpte & PG_V) == 0) pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); return (KERN_SUCCESS); allocf: if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, psind); goto out; } if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock, va); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else { /* * Since this mapping is unmanaged, assume that PG_A * is set. */ pmap_invalidate_page(pmap, va); } origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_adj(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Returns true if every page table entry in the specified page table page is * zero. */ static bool pmap_every_pte_zero(vm_paddr_t pa) { pt_entry_t *pt_end, *pte; KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); pte = (pt_entry_t *)PHYS_TO_DMAP(pa); for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { if (*pte != 0) return (false); } return (true); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) { CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { pmap_abort_ptp(pmap, va, pdpg); return (KERN_PROTECTION_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } /* * If there are existing mappings, either abort or remove them. */ oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg == NULL || pdpg->ref_count > 1, ("pmap_enter_pde: pdpg's reference count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < VM_MAXUSER_ADDRESS || (oldpde & PG_PS) != 0 || !pmap_every_pte_zero(oldpde & PG_FRAME))) { if (pdpg != NULL) pdpg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_alloc_pde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } if (va < VM_MAXUSER_ADDRESS) { vm_page_free_pages_toq(&free, true); KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_pde: freed kernel page table page")); /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); counter_u64_add(pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { pt_entry_t newpte, *pte, PG_V; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = pmap_allocpte_alloc(pmap, ptepindex, NULL, va); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) mpte->ref_count--; return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); return (NULL); } /* * Increment counters */ pmap_resident_count_adj(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); if (pde == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); counter_u64_add(pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->ref_count--; KASSERT(pdpg->ref_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V, PG_G; PG_V = pmap_valid_bit(pmap); PG_G = pmap_global_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); MPASS(pmap != kernel_pmap); /* XXXKIB */ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); atomic_clear_long(pdpe, PG_W); pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= end_addr, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, addr, end_addr, va_next)); MPASS((addr & PDPMASK) == 0); MPASS((*pdpe & PG_MANAGED) == 0); srcptepaddr = *pdpe; pdpe = pmap_pdpe(dst_pmap, addr); if (pdpe == NULL) { if (pmap_allocpte_alloc(dst_pmap, pmap_pml4e_pindex(addr), NULL, addr) == NULL) break; pdpe = pmap_pdpe(dst_pmap, addr); } else { pml4e = pmap_pml4e(dst_pmap, addr); dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); dst_pdpg->ref_count++; } KASSERT(*pdpe == 0, ("1G mapping present in dst pmap " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, addr, end_addr, va_next)); *pdpe = srcptepaddr & ~PG_W; pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { /* * We can only virtual copy whole superpages. */ if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); if (pde == NULL) break; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { /* * We leave the dirty bit unchanged because * managed read/write superpage mappings are * required to be dirty. However, managed * superpage mappings are not required to * have their accessed bit set, so we clear * it because we don't know if this mapping * will be used. */ srcptepaddr &= ~PG_W; if ((srcptepaddr & PG_MANAGED) != 0) srcptepaddr &= ~PG_A; *pde = srcptepaddr; pmap_resident_count_adj(dst_pmap, NBPDR / PAGE_SIZE); counter_u64_add(pmap_pde_mappings, 1); } else pmap_abort_ptp(dst_pmap, addr, dst_pdpg); continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->ref_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_adj(dst_pmap, 1); } else { pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->ref_count >= srcmpte->ref_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. */ if (error == ENOMEM) pmap_pkru_deassign_all(dst_pmap); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; struct pv_chunklist free_chunks[PMAP_MEMDOM]; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, i, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); for (i = 0; i < PMAP_MEMDOM; i++) TAILQ_INIT(&free_chunks[i]); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } /* Mark free */ pc->pc_map[field] |= bitmask; /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pte_clear(pte); if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); if (superpage) { pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_adj(pmap, -1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page reference count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_adj(pmap, -1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(counter_u64_add(pv_entry_frees, freed)); PV_STAT(counter_u64_add(pv_entry_spare, freed)); PV_STAT(counter_u64_add(pv_entry_count, -freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(lock); retry: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); oldpte = *pte; if (oldpte & PG_RW) { while (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~(PG_RW | PG_M))) cpu_spinwait(); if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = false; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde_locked(pmap, pde, va, &lock) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); atomic_clear_long(pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the properties for a leaf page table entry. */ static __inline void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) { u_long opte, npte; opte = *(u_long *)pte; do { npte = opte & ~mask; npte |= bits; } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, npte)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if ((flags & MAPDEV_SETATTR) != 0) { PMAP_LOCK(kernel_pmap); i = pmap_change_props_locked(va, size, PROT_NONE, mode, flags); PMAP_UNLOCK(kernel_pmap); } else i = 0; if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | MAPDEV_SETATTR)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, MAPDEV_SETATTR)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, MAPDEV_FLUSHCACHE)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) { pmap_qremove(va, atop(size)); kva_free(va, size); } } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); if (pdpg == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); counter_u64_add(pmap_pdpe_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } void pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) { int error; m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); PMAP_UNLOCK(kernel_pmap); if (error != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, PROT_NONE, mode, MAPDEV_FLUSHCACHE); PMAP_UNLOCK(kernel_pmap); return (error); } /* * Changes the specified virtual address range's protections to those * specified by "prot". Like pmap_change_attr(), protections for aliases * in the direct map are updated as well. Protections on aliasing mappings may * be a subset of the requested protections; for example, mappings in the direct * map are never executable. */ int pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) { int error; /* Only supported within the kernel map. */ if (va < VM_MIN_KERNEL_ADDRESS) return (EINVAL); PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, prot, -1, MAPDEV_ASSERTVALID); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde, pde_bits, pde_mask; pt_entry_t *pte, pte_bits, pte_mask; int error; bool changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); /* * Construct our flag sets and masks. "bits" is the subset of * "mask" that will be set in each modified PTE. * * Mappings in the direct map are never allowed to be executable. */ pde_bits = pte_bits = 0; pde_mask = pte_mask = 0; if (mode != -1) { pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); pde_mask |= X86_PG_PDE_CACHE; pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); pte_mask |= X86_PG_PTE_CACHE; } if (prot != VM_PROT_NONE) { if ((prot & VM_PROT_WRITE) != 0) { pde_bits |= X86_PG_RW; pte_bits |= X86_PG_RW; } if ((prot & VM_PROT_EXECUTE) == 0 || va < VM_MIN_KERNEL_ADDRESS) { pde_bits |= pg_nx; pte_bits |= pg_nx; } pde_mask |= X86_PG_RW | pg_nx; pte_mask |= X86_PG_RW | pg_nx; } /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * properties, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & pde_mask) == pde_bits) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2MB page already has the required * properties, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & pde_mask) == pde_bits) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * properties if required. */ changed = false; pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & pde_mask) != pde_bits) { pmap_pte_props(pdpe, pde_bits, pde_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & pde_mask) != pde_bits) { pmap_pte_props(pde, pde_bits, pde_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & pte_mask) != pte_bits) { pmap_pte_props(pte, pte_bits, pte_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, prot, mode, flags); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pdp_entry_t *pdpe; pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); pte = 0; pa = 0; val = 0; pdpe = pmap_pdpe(pmap, addr); if (pdpe == NULL) goto out; if ((*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) { pte = *pdpe; pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & PG_FRAME; val = MINCORE_PSIND(2); } else { pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V) != 0) { if ((*pdep & PG_PS) != 0) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((pte & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_PSIND(1); } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } } } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { *pap = pa; } out: PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; } static void pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_t old_pmap; uint64_t cached, cr3, kcr3, ucr3; KASSERT((read_rflags() & PSL_I) == 0, ("PCID needs interrupts disabled in pmap_activate_sw()")); /* See the comment in pmap_invalidate_page_pcid(). */ if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); old_pmap = PCPU_GET(curpmap); MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); old_pmap->pm_pcids[cpuid].pm_gen = 0; } cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) counter_u64_add(pcid_save_cnt, 1); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; KASSERT((read_rflags() & PSL_I) == 0, ("PCID needs interrupts disabled in pmap_activate_sw()")); cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) counter_u64_add(pcid_save_cnt, 1); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti) return (pmap_activate_sw_pcid_pti); else if (pmap_pcid_enabled && !pti) return (pmap_activate_sw_pcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) { if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); return; } cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { /* * invltlb_{invpcid,}_pcid_handler() is used to handle an * invalidate_all IPI, which checks for curpmap == * smp_tlb_pmap. The below sequence of operations has a * window where %CR3 is loaded with the new pmap's PML4 * address, but the curpmap value has not yet been updated. * This causes the invltlb IPI handler, which is called * between the updates, to execute as a NOP, which leaves * stale TLB entries. * * Note that the most common use of pmap_activate_sw(), from * a context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * so the IPI is delayed until after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. */ spinlock_enter(); pmap_activate_sw(td); spinlock_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); if (pml4 == NULL) goto done; ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their reference count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their reference count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { - return (pmap_alloc_pt_page(kernel_pmap, 0, - VM_ALLOC_NORMAL | VM_ALLOC_ZERO)); + return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. */ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. */ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> ref_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> ref_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->ref_count--; if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->ref_count--; if (m->ref_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->ref_count--; if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_atomic(void) { atomic_thread_fence_seq_cst(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_atomic); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. * * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->ref_count > 0, ("page %p not referenced", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false); /* NMI stack IST 2 */ va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false); /* MC# stack IST 3 */ va = __pcpu[i].pc_common_tss.tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); /* DB# stack IST 4 */ va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->ref_count++; } static void pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->ref_count > 0); MPASS(only_ref || m->ref_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->ref_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva; va < eva; prev_ppr = ppr) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if (va == sva) prev_ppr = ppr; else if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } #ifdef KASAN static vm_page_t pmap_kasan_enter_alloc_4k(void) { vm_page_t m; - m = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO); + m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow shadow map", __func__); - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); return (m); } static vm_page_t pmap_kasan_enter_alloc_2m(void) { vm_page_t m; m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT); if (m != NULL) memset((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 0, NBPDR); return (m); } /* * Grow the shadow map by at least one 4KB page at the specified address. Use * 2MB pages when possible. */ void pmap_kasan_enter(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; mtx_assert(&kernel_map->system_mtx, MA_OWNED); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { m = pmap_kasan_enter_alloc_4k(); *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | pg_nx); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) { m = pmap_kasan_enter_alloc_2m(); if (m != NULL) { *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); } else { m = pmap_kasan_enter_alloc_4k(); *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | pg_nx); } } if ((*pde & X86_PG_PS) != 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & X86_PG_V) != 0) return; m = pmap_kasan_enter_alloc_4k(); *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | pg_nx); } #endif #ifdef KMSAN static vm_page_t pmap_kmsan_enter_alloc_4k(void) { vm_page_t m; - m = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO); + m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow shadow map", __func__); - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); return (m); } static vm_page_t pmap_kmsan_enter_alloc_2m(void) { vm_page_t m; m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT); if (m != NULL) memset((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 0, NBPDR); return (m); } /* * Grow the shadow or origin maps by at least one 4KB page at the specified * address. Use 2MB pages when possible. */ void pmap_kmsan_enter(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; mtx_assert(&kernel_map->system_mtx, MA_OWNED); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { m = pmap_kmsan_enter_alloc_4k(); *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | pg_nx); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) { m = pmap_kmsan_enter_alloc_2m(); if (m != NULL) { *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); } else { m = pmap_kmsan_enter_alloc_4k(); *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | pg_nx); } } if ((*pde & X86_PG_PS) != 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & X86_PG_V) != 0) return; m = pmap_kmsan_enter_alloc_4k(); *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | pg_nx); } #endif /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_UNCACHED: mode = "U-"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", __func__, pat_idx, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & X86_PG_U) != 0 ? 'u' : 's', (range->attrs & X86_PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~X86_PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); attrs |= pdpe & pg_nx; attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); if ((pdpe & PG_PS) != 0) { attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pde != 0) { attrs |= pde & pg_nx; attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); } if ((pde & PG_PS) != 0) { attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. */ if ((attrs & X86_PG_PTE_PAT) != 0) attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pml4_entry_t pml4e; pdp_entry_t *pdp, pdpe; pd_entry_t *pd, pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Outside of the large map, kernel page table pages are never * freed, so at worst we will observe inconsistencies in the output. * Within the large map, ensure that PDP and PD page addresses are * valid before descending. */ for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: sbuf_printf(sb, "\nDirect map:\n"); break; #ifdef KASAN case KASANPML4I: sbuf_printf(sb, "\nKASAN shadow map:\n"); break; #endif #ifdef KMSAN case KMSANSHADPML4I: sbuf_printf(sb, "\nKMSAN shadow map:\n"); break; case KMSANORIGPML4I: sbuf_printf(sb, "\nKMSAN origin map:\n"); break; #endif case KPML4BASE: sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: sbuf_printf(sb, "\nLarge map:\n"); break; } /* Convert to canonical form. */ if (sva == 1ul << 47) sva |= -1ul << 48; restart: pml4e = kernel_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); sva += NBPML4; continue; } pa = pml4e & PG_FRAME; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { pdpe = pdp[j]; if ((pdpe & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDP; continue; } pa = pdpe & PG_FRAME; if ((pdpe & PG_PS) != 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 0, 0); range.pdpes++; sva += NBPDP; continue; } if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) { /* * Page table pages for the large map may be * freed. Validate the next-level address * before descending. */ goto restart; } pd = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_pde_index(sva); k < NPDEPG; k++) { pde = pd[k]; if ((pde & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; continue; } pa = pde & PG_FRAME; if ((pde & PG_PS) != 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, 0); range.pdes++; sva += NBPDR; continue; } if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) { /* * Page table pages for the large map * may be freed. Validate the * next-level address before descending. */ goto restart; } pt = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_pte_index(sva); l < NPTEPG; l++, sva += PAGE_SIZE) { pte = pt[l]; if ((pte & X86_PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, pte); range.ptes++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); db_printf("VA 0x%016lx", va); if (pmap_is_la57(pmap)) { pml5 = pmap_pml5e(pmap, va); db_printf(" pml5e 0x%016lx", *pml5); if ((*pml5 & PG_V) == 0) { db_printf("\n"); return; } pml4 = pmap_pml5e_to_pml4e(pml5, va); } else { pml4 = pmap_pml4e(pmap, va); } db_printf(" pml4e 0x%016lx", *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe 0x%016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde 0x%016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte 0x%016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } static void ptpages_show_page(int level, int idx, vm_page_t pg) { db_printf("l %d i %d pg %p phys %#lx ref %x\n", level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); } static void ptpages_show_complain(int level, int idx, uint64_t pte) { db_printf("l %d i %d pte %#lx\n", level, idx, pte); } static void ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) { vm_page_t pg3, pg2, pg1; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; int i4, i3, i2; pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); for (i4 = 0; i4 < num_entries; i4++) { if ((pml4[i4] & PG_V) == 0) continue; pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); if (pg3 == NULL) { ptpages_show_complain(3, i4, pml4[i4]); continue; } ptpages_show_page(3, i4, pg3); pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); for (i3 = 0; i3 < NPDPEPG; i3++) { if ((pdp[i3] & PG_V) == 0) continue; pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); if (pg3 == NULL) { ptpages_show_complain(2, i3, pdp[i3]); continue; } ptpages_show_page(2, i3, pg2); pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); for (i2 = 0; i2 < NPDEPG; i2++) { if ((pd[i2] & PG_V) == 0) continue; pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); if (pg1 == NULL) { ptpages_show_complain(1, i2, pd[i2]); continue; } ptpages_show_page(1, i2, pg1); } } } } DB_SHOW_COMMAND(ptpages, pmap_ptpages) { pmap_t pmap; vm_page_t pg; pml5_entry_t *pml5; uint64_t PG_V; int i5; if (have_addr) pmap = (pmap_t)addr; else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); if (pmap_is_la57(pmap)) { pml5 = pmap->pm_pmltop; for (i5 = 0; i5 < NUPML5E; i5++) { if ((pml5[i5] & PG_V) == 0) continue; pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); if (pg == NULL) { ptpages_show_complain(4, i5, pml5[i5]); continue; } ptpages_show_page(4, i5, pg); ptpages_show_pml4(pg, NPML4EPG, PG_V); } } else { ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); } } #endif diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c index 2fc981f23c83..479cb4e37d9e 100644 --- a/sys/amd64/amd64/uma_machdep.c +++ b/sys/amd64/amd64/uma_machdep.c @@ -1,76 +1,74 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_domain(NULL, 0, domain, - malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - pagezero(va); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index f596ace35ba7..51438274f1ff 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -1,6955 +1,6955 @@ /*- * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD * * Copyright (c) 1991 Regents of the University of California. * Copyright (c) 1994 John S. Dyson * Copyright (c) 1994 David Greenman * Copyright (c) 2005-2010 Alan L. Cox * Copyright (c) 2014-2016 Svatopluk Kraus * Copyright (c) 2014-2016 Michal Meloun * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include "opt_pmap.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifndef DIAGNOSTIC #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PMAP_DEBUG static void pmap_zero_page_check(vm_page_t m); void pmap_debug(int level); int pmap_pid_dump(int pid); #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 1; #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #endif /* PMAP_DEBUG */ /* * Level 2 page tables map definion ('max' is excluded). */ #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define UPT2V_MAX_ADDRESS \ ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) /* * Promotion to a 1MB (PTE1) page mapping requires that the corresponding * 4KB (PTE2) page mappings have identical settings for the following fields: */ #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ PTE2_ATTR_MASK) #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ PTE1_ATTR_MASK) #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ (((l2_attr) & L2_C) ? L1_S_C : 0) | \ (((l2_attr) & L2_B) ? L1_S_B : 0) | \ (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ (((l2_attr) & PTE2_W) ? PTE1_W : 0)) #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ (((l1_attr) & L1_S_C) ? L2_C : 0) | \ (((l1_attr) & L1_S_B) ? L2_B : 0) | \ (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ (((l1_attr) & PTE1_W) ? PTE2_W : 0)) /* * PTE2 descriptors creation macros. */ #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * The boot_pt1 is used temporary in very early boot stage as L1 page table. * We can init many things with no memory allocation thanks to its static * allocation and this brings two main advantages: * (1) other cores can be started very simply, * (2) various boot loaders can be supported as its arguments can be processed * in virtual address space and can be moved to safe location before * first allocation happened. * Only disadvantage is that boot_pt1 is used only in very early boot stage. * However, the table is uninitialized and so lays in bss. Therefore kernel * image size is not influenced. * * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and * CPU suspend/resume game. */ extern pt1_entry_t boot_pt1[]; vm_paddr_t base_pt1; pt1_entry_t *kern_pt1; pt2_entry_t *kern_pt2tab; pt2_entry_t *PT2MAP; static uint32_t ttb_flags; static vm_memattr_t pt_memattr; ttb_entry_t pmap_kern_ttb; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static vm_offset_t kernel_vm_end_new; vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; vm_offset_t vm_max_kernel_address; vm_paddr_t kernel_l1pa; static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ vm_paddr_t first_managed_pa; #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) /* * All those kernel PT submaps that BSD is so fond of */ caddr_t _tmppt = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt2_entry_t *PMAP1 = NULL, *PMAP2; static pt2_entry_t *PADDR1 = NULL, *PADDR2; #ifdef DDB static pt2_entry_t *PMAP3; static pt2_entry_t *PADDR3; static int PMAP3cpu __unused; /* for SMP only */ #endif #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte2_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte2_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte2_quick didn't change PMAP1"); static struct mtx PMAP2mutex; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static __inline void pt2_wirecount_init(vm_page_t m); static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va); static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, vm_page_t m); void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); /* * Function to set the debug level of the pmap code. */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ /* * This table must corespond with memory attribute configuration in vm.h. * First entry is used for normal system mapping. * * Device memory is always marked as shared. * Normal memory is shared only in SMP . * Not outer shareable bits are not used yet. * Class 6 cannot be used on ARM11. */ #define TEXDEF_TYPE_SHIFT 0 #define TEXDEF_TYPE_MASK 0x3 #define TEXDEF_INNER_SHIFT 2 #define TEXDEF_INNER_MASK 0x3 #define TEXDEF_OUTER_SHIFT 4 #define TEXDEF_OUTER_MASK 0x3 #define TEXDEF_NOS_SHIFT 6 #define TEXDEF_NOS_MASK 0x1 #define TEX(t, i, o, s) \ ((t) << TEXDEF_TYPE_SHIFT) | \ ((i) << TEXDEF_INNER_SHIFT) | \ ((o) << TEXDEF_OUTER_SHIFT | \ ((s) << TEXDEF_NOS_SHIFT)) static uint32_t tex_class[8] = { /* type inner cache outer cache */ TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ }; #undef TEX static uint32_t pte2_attr_tab[8] = { PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 0, /* 5 - NOT USED YET */ 0, /* 6 - NOT USED YET */ 0 /* 7 - NOT USED YET */ }; CTASSERT(VM_MEMATTR_WB_WA == 0); CTASSERT(VM_MEMATTR_NOCACHE == 1); CTASSERT(VM_MEMATTR_DEVICE == 2); CTASSERT(VM_MEMATTR_SO == 3); CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < VM_MEMATTR_END); } static inline uint32_t vm_memattr_to_pte2(vm_memattr_t ma) { KASSERT((u_int)ma < VM_MEMATTR_END, ("%s: bad vm_memattr_t %d", __func__, ma)); return (pte2_attr_tab[(u_int)ma]); } static inline uint32_t vm_page_pte2_attr(vm_page_t m) { return (vm_memattr_to_pte2(m->md.pat_mode)); } /* * Convert TEX definition entry to TTB flags. */ static uint32_t encode_ttb_flags(int idx) { uint32_t inner, outer, nos, reg; inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; reg = nos << 5; reg |= outer << 3; if (cpuinfo.coherent_walk) reg |= (inner & 0x1) << 6; reg |= (inner & 0x2) >> 1; #ifdef SMP ARM_SMP_UP( reg |= 1 << 1, ); #endif return reg; } /* * Set TEX remapping registers in current CPU. */ void pmap_set_tex(void) { uint32_t prrr, nmrr; uint32_t type, inner, outer, nos; int i; #ifdef PMAP_PTE_NOCACHE /* XXX fixme */ if (cpuinfo.coherent_walk) { pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); } else { pt_memattr = VM_MEMATTR_NOCACHE; ttb_flags = encode_ttb_flags(1); } #else pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); #endif prrr = 0; nmrr = 0; /* Build remapping register from TEX classes. */ for (i = 0; i < 8; i++) { type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK; inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; prrr |= type << (i * 2); prrr |= nos << (i + 24); nmrr |= inner << (i * 2); nmrr |= outer << (i * 2 + 16); } /* Add shareable bits for device memory. */ prrr |= PRRR_DS0 | PRRR_DS1; /* Add shareable bits for normal memory in SMP case. */ #ifdef SMP ARM_SMP_UP( prrr |= PRRR_NS1, ); #endif cp15_prrr_set(prrr); cp15_nmrr_set(nmrr); /* Caches are disabled, so full TLB flush should be enough. */ tlb_flush_all_local(); } /* * Remap one vm_meattr class to another one. This can be useful as * workaround for SOC errata, e.g. if devices must be accessed using * SO memory class. * * !!! Please note that this function is absolutely last resort thing. * It should not be used under normal circumstances. !!! * * Usage rules: * - it shall be called after pmap_bootstrap_prepare() and before * cpu_mp_start() (thus only on boot CPU). In practice, it's expected * to be called from platform_attach() or platform_late_init(). * * - if remapping doesn't change caching mode, or until uncached class * is remapped to any kind of cached one, then no other restriction exists. * * - if pmap_remap_vm_attr() changes caching mode, but both (original and * remapped) remain cached, then caller is resposible for calling * of dcache_wbinv_poc_all(). * * - remapping of any kind of cached class to uncached is not permitted. */ void pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) { int old_idx, new_idx; /* Map VM memattrs to indexes to tex_class table. */ old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); /* Replace TEX attribute and apply it. */ tex_class[old_idx] = tex_class[new_idx]; pmap_set_tex(); } /* * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, * KERNBASE is mapped by first L2 page table in L2 page table page. It * meets same constrain due to PT2MAP being placed just under KERNBASE. */ CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); /* * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. * For now, anyhow, the following check must be fulfilled. */ CTASSERT(PAGE_SIZE == PTE2_SIZE); /* * We don't want to mess up MI code with all MMU and PMAP definitions, * so some things, which depend on other ones, are defined independently. * Now, it is time to check that we don't screw up something. */ CTASSERT(PDRSHIFT == PTE1_SHIFT); /* * Check L1 and L2 page table entries definitions consistency. */ CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); /* * Check L2 page tables page consistency. */ CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); /* * Check PT2TAB consistency. * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. * This should be done without remainder. */ CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); /* * A PT2MAP magic. * * All level 2 page tables (PT2s) are mapped continuously and accordingly * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page * must be used together, but not necessary at once. The first PT2 in a page * must map things on correctly aligned address and the others must follow * in right order. */ #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) /* * Check PT2TAB consistency. * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. * The both should be done without remainder. */ CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); /* * The implementation was made general, however, with the assumption * bellow in mind. In case of another value of NPG_IN_PT2TAB, * the code should be once more rechecked. */ CTASSERT(NPG_IN_PT2TAB == 1); /* * Get offset of PT2 in a page * associated with given PT1 index. */ static __inline u_int page_pt2off(u_int pt1_idx) { return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); } /* * Get physical address of PT2 * associated with given PT2s page and PT1 index. */ static __inline vm_paddr_t page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) { return (pgpa + page_pt2off(pt1_idx)); } /* * Get first entry of PT2 * associated with given PT2s page and PT1 index. */ static __inline pt2_entry_t * page_pt2(vm_offset_t pgva, u_int pt1_idx) { return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); } /* * Get virtual address of PT2s page (mapped in PT2MAP) * which holds PT2 which holds entry which maps given virtual address. */ static __inline vm_offset_t pt2map_pt2pg(vm_offset_t va) { va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); return ((vm_offset_t)pt2map_entry(va)); } /***************************************************************************** * * THREE pmap initialization milestones exist: * * locore.S * -> fundamental init (including MMU) in ASM * * initarm() * -> fundamental init continues in C * -> first available physical address is known * * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) * -> basic (safe) interface for physical address allocation is made * -> basic (safe) interface for virtual mapping is made * -> limited not SMP coherent work is possible * * -> more fundamental init continues in C * -> locks and some more things are available * -> all fundamental allocations and mappings are done * * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) * -> phys_avail[] and virtual_avail is set * -> control is passed to vm subsystem * -> physical and virtual address allocation are off limit * -> low level mapping functions, some SMP coherent, * are available, which cannot be used before vm subsystem * is being inited * * mi_startup() * -> vm subsystem is being inited * * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) * -> pmap is fully inited * *****************************************************************************/ /***************************************************************************** * * PMAP first stage initialization and utility functions * for pre-bootstrap epoch. * * After pmap_bootstrap_prepare() is called, the following functions * can be used: * * (1) strictly only for this stage functions for physical page allocations, * virtual space allocations, and mappings: * * vm_paddr_t pmap_preboot_get_pages(u_int num); * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); * vm_offset_t pmap_preboot_reserve_pages(u_int num); * vm_offset_t pmap_preboot_get_vpages(u_int num); * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, * vm_prot_t prot, vm_memattr_t attr); * * (2) for all stages: * * vm_paddr_t pmap_kextract(vm_offset_t va); * * NOTE: This is not SMP coherent stage. * *****************************************************************************/ #define KERNEL_P2V(pa) \ ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) #define KERNEL_V2P(va) \ ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) static vm_paddr_t last_paddr; /* * Pre-bootstrap epoch page allocator. */ vm_paddr_t pmap_preboot_get_pages(u_int num) { vm_paddr_t ret; ret = last_paddr; last_paddr += num * PAGE_SIZE; return (ret); } /* * The fundamental initialization of PMAP stuff. * * Some things already happened in locore.S and some things could happen * before pmap_bootstrap_prepare() is called, so let's recall what is done: * 1. Caches are disabled. * 2. We are running on virtual addresses already with 'boot_pt1' * as L1 page table. * 3. So far, all virtual addresses can be converted to physical ones and * vice versa by the following macros: * KERNEL_P2V(pa) .... physical to virtual ones, * KERNEL_V2P(va) .... virtual to physical ones. * * What is done herein: * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. * 2. PT2MAP magic is brought to live. * 3. Basic preboot functions for page allocations and mappings can be used. * 4. Everything is prepared for L1 cache enabling. * * Variations: * 1. To use second TTB register, so kernel and users page tables will be * separated. This way process forking - pmap_pinit() - could be faster, * it saves physical pages and KVA per a process, and it's simple change. * However, it will lead, due to hardware matter, to the following: * (a) 2G space for kernel and 2G space for users. * (b) 1G space for kernel in low addresses and 3G for users above it. * A question is: Is the case (b) really an option? Note that case (b) * does save neither physical memory and KVA. */ void pmap_bootstrap_prepare(vm_paddr_t last) { vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; vm_offset_t pt2pg_va; pt1_entry_t *pte1p; pt2_entry_t *pte2p; u_int i; uint32_t l1_attr; /* * Now, we are going to make real kernel mapping. Note that we are * already running on some mapping made in locore.S and we expect * that it's large enough to ensure nofault access to physical memory * allocated herein before switch. * * As kernel image and everything needed before are and will be mapped * by section mappings, we align last physical address to PTE1_SIZE. */ last_paddr = pte1_roundup(last); /* * Allocate and zero page(s) for kernel L1 page table. * * Note that it's first allocation on space which was PTE1_SIZE * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. */ base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); bzero((void*)kern_pt1, NB_IN_PT1); pte1_sync_range(kern_pt1, NB_IN_PT1); /* Allocate and zero page(s) for kernel PT2TAB. */ pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); bzero(kern_pt2tab, NB_IN_PT2TAB); pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); /* Allocate and zero page(s) for kernel L2 page tables. */ pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); pt2pg_va = KERNEL_P2V(pt2pg_pa); size = NKPT2PG * PAGE_SIZE; bzero((void*)pt2pg_va, size); pte2_sync_range((pt2_entry_t *)pt2pg_va, size); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated pages for kernel L2 page tables so that vm_page * structures representing these pages will be created. The vm_page * structures are required for promotion of the corresponding kernel * virtual addresses to section mappings. */ vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); /* * Insert allocated L2 page table pages to PT2TAB and make * link to all PT2s in L1 page table. See how kernel_vm_end * is initialized. * * We play simple and safe. So every KVA will have underlaying * L2 page table, even kernel image mapped by sections. */ pte2p = kern_pt2tab_entry(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) pt2tab_store(pte2p++, PTE2_KPT(pa)); pte1p = kern_pte1(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) pte1_store(pte1p++, PTE1_LINK(pa)); /* Make section mappings for kernel. */ l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); pte1p = kern_pte1(KERNBASE); for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); /* * Get free and aligned space for PT2MAP and make L1 page table links * to L2 page tables held in PT2TAB. * * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus * each entry in PT2TAB maps all PT2s in a page. This implies that * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. */ PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); pte1p = kern_pte1((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. * Each pmap will hold own PT2TAB, so the mapping should be not global. */ pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* * Choose correct L2 page table and make mappings for allocations * made herein which replaces temporary locore.S mappings after a while. * Note that PT2MAP cannot be used until we switch to kern_pt1. * * Note, that these allocations started aligned on 1M section and * kernel PT1 was allocated first. Making of mappings must follow * order of physical allocations as we've used KERNEL_P2V() macro * for virtual addresses resolution. */ pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); /* Make mapping for kernel L1 page table. */ for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Make mapping for kernel PT2TAB. */ for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ pmap_kern_ttb = base_pt1 | ttb_flags; cpuinfo_reinit_mmu(pmap_kern_ttb); /* * Initialize the first available KVA. As kernel image is mapped by * sections, we are leaving some gap behind. */ virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; } /* * Setup L2 page table page for given KVA. * Used in pre-bootstrap epoch. * * Note that we have allocated NKPT2PG pages for L2 page tables in advance * and used them for mapping KVA starting from KERNBASE. However, this is not * enough. Vectors and devices need L2 page tables too. Note that they are * even above VM_MAX_KERNEL_ADDRESS. */ static __inline vm_paddr_t pmap_preboot_pt2pg_setup(vm_offset_t va) { pt2_entry_t *pte2p, pte2; vm_paddr_t pt2pg_pa; /* Get associated entry in PT2TAB. */ pte2p = kern_pt2tab_entry(va); /* Just return, if PT2s page exists already. */ pte2 = pt2tab_load(pte2p); if (pte2_is_valid(pte2)) return (pte2_pa(pte2)); KASSERT(va >= VM_MAX_KERNEL_ADDRESS, ("%s: NKPT2PG too small", __func__)); /* * Allocate page for PT2s and insert it to PT2TAB. * In other words, map it into PT2MAP space. */ pt2pg_pa = pmap_preboot_get_pages(1); pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); /* Zero all PT2s in allocated page. */ bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); return (pt2pg_pa); } /* * Setup L2 page table for given KVA. * Used in pre-bootstrap epoch. */ static void pmap_preboot_pt2_setup(vm_offset_t va) { pt1_entry_t *pte1p; vm_paddr_t pt2pg_pa, pt2_pa; /* Setup PT2's page. */ pt2pg_pa = pmap_preboot_pt2pg_setup(va); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); /* Insert PT2 to PT1. */ pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } /* * Get L2 page entry associated with given KVA. * Used in pre-bootstrap epoch. */ static __inline pt2_entry_t* pmap_preboot_vtopte2(vm_offset_t va) { pt1_entry_t *pte1p; /* Setup PT2 if needed. */ pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ pmap_preboot_pt2_setup(va); return (pt2map_entry(va)); } /* * Pre-bootstrap epoch page(s) mapping(s). */ void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) { u_int i; pt2_entry_t *pte2p; /* Map all the pages. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KRW(pa)); va += PAGE_SIZE; pa += PAGE_SIZE; } } /* * Pre-bootstrap epoch virtual space alocator. */ vm_offset_t pmap_preboot_reserve_pages(u_int num) { u_int i; vm_offset_t start, va; pt2_entry_t *pte2p; /* Allocate virtual space. */ start = va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Zero the mapping. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, 0); va += PAGE_SIZE; } return (start); } /* * Pre-bootstrap epoch page(s) allocation and mapping(s). */ vm_offset_t pmap_preboot_get_vpages(u_int num) { vm_paddr_t pa; vm_offset_t va; /* Allocate physical page(s). */ pa = pmap_preboot_get_pages(num); /* Allocate virtual space. */ va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Map and zero all. */ pmap_preboot_map_pages(pa, va, num); bzero((void *)va, num * PAGE_SIZE); return (va); } /* * Pre-bootstrap epoch page mapping(s) with attributes. */ void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, vm_prot_t prot, vm_memattr_t attr) { u_int num; u_int l1_attr, l1_prot, l2_prot, l2_attr; pt1_entry_t *pte1p; pt2_entry_t *pte2p; l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l2_attr = vm_memattr_to_pte2(attr); l1_prot = ATTR_TO_L1(l2_prot); l1_attr = ATTR_TO_L1(l2_attr); /* Map all the pages. */ num = round_page(size); while (num > 0) { if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); va += PTE1_SIZE; pa += PTE1_SIZE; num -= PTE1_SIZE; } else { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); va += PAGE_SIZE; pa += PAGE_SIZE; num -= PAGE_SIZE; } } } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". */ vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); } else if (pte1_is_link(pte1)) { /* * We should beware of concurrent promotion that changes * pte1 at this point. However, it's not a problem as PT2 * page is preserved by promotion in PT2TAB. So even if * it happens, using of PT2MAP is still safe. * * QQQ: However, concurrent removing is a problem which * ends in abort on PT2MAP space. Locking must be used * to deal with this. */ pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2) | (va & PTE2_OFFSET); } else { panic("%s: va %#x pte1 %#x", __func__, va, pte1); } return (pa); } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". Also * return L2 page table entry which maps the address. * * This is only intended to be used for panic dumps. */ vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2); } else { pte2 = 0; pa = 0; } if (pte2p != NULL) *pte2p = pte2; return (pa); } /***************************************************************************** * * PMAP second stage initialization and utility functions * for bootstrap epoch. * * After pmap_bootstrap() is called, the following functions for * mappings can be used: * * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); * void pmap_kremove(vm_offset_t va); * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, * int prot); * * NOTE: This is not SMP coherent stage. And physical page allocation is not * allowed during this stage. * *****************************************************************************/ /* * Initialize kernel PMAP locks and lists, kernel_pmap itself, and * reserve various virtual spaces for temporary mappings. */ void pmap_bootstrap(vm_offset_t firstaddr) { pt2_entry_t *unused __unused; struct pcpu *pc; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ kernel_pmap->pm_pt1 = kern_pt1; kernel_pmap->pm_pt2tab = kern_pt2tab; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) do { \ v = (c)pmap_preboot_reserve_pages(n); \ p = pt2map_entry((vm_offset_t)v); \ } while (0) /* * Local CMAP1/CMAP2 are used for zeroing and copying pages. * Local CMAP2 is also used for data cache cleaning. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); /* * _tmppt is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, _tmppt, 1); /* * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), * respectively. PADDR3 is used by pmap_pte2_ddb(). */ SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); #ifdef DDB SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); #endif mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); /* * Note that in very short time in initarm(), we are going to * initialize phys_avail[] array and no further page allocation * can happen after that until vm subsystem will be initialized. */ kernel_vm_end_new = kernel_vm_end; virtual_end = vm_max_kernel_address; } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap1_addr != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap1_pte2p = pt2map_entry(pages); pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); pc->pc_cmap1_addr = (caddr_t)pages; pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * The function can already be use in second initialization stage. * As such, the function DOES NOT call pmap_growkernel() where PT2 * allocation can happen. So if used, be sure that PT2 for given * virtual address is allocated already! * * Add a wired page to the kva. * Note: not SMP coherent. */ static __inline void pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, uint32_t attr) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ /* * This is a very low level function, so PT2 and particularly * PT2PG associated with given virtual address must be already * allocated. It's a pain mainly during pmap initialization * stage. However, called after pmap initialization with * virtual address not under kernel_vm_end will lead to * the same misery. */ if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) panic("%s: kernel PT2 not allocated!", __func__); } pte2p = pt2map_entry(va); pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); } PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (pte1_is_section(pte1_load(pte1p))) { pte1_clear(pte1p); } else { pte2p = pt2map_entry(va); pte2_clear(pte2p); } } /* * Share new kernel PT2PG with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) { pmap_t pmap; pt2_entry_t *pte2p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, npte2); } mtx_unlock_spin(&allpmaps_lock); } /* * Share new kernel PTE1 with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } mtx_unlock_spin(&allpmaps_lock); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * NOTE: Read the comments above pmap_kenter_prot_attr() as * the function is used herein! */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t pte1_offset; pt1_entry_t npte1; uint32_t l1prot, l2prot; uint32_t l1attr, l2attr; PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," " prot = %d\n", __func__, *virt, start, end, end - start, prot)); l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l1prot = ATTR_TO_L1(l2prot); l2attr = PTE2_ATTR_DEFAULT; l1attr = ATTR_TO_L1(l2attr); va = *virt; /* * Does the physical address range's size and alignment permit at * least one section mapping to be created? */ pte1_offset = start & PTE1_OFFSET; if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= PTE1_SIZE) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of section mappings. */ if ((va & PTE1_OFFSET) < pte1_offset) va = pte1_trunc(va) + pte1_offset; else if ((va & PTE1_OFFSET) > pte1_offset) va = pte1_roundup(va) + pte1_offset; } sva = va; while (start < end) { if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { KASSERT((va & PTE1_OFFSET) == 0, ("%s: misaligned va %#x", __func__, va)); npte1 = PTE1_KERN(start, l1prot, l1attr); pmap_kenter_pte1(va, npte1); va += PTE1_SIZE; start += PTE1_SIZE; } else { pmap_kenter_prot_attr(va, start, l2prot, l2attr); va += PAGE_SIZE; start += PAGE_SIZE; } } tlb_flush_range(sva, va - sva); *virt = va; return (sva); } /* * Make a temporary mapping for a physical address. * This is only intended to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); tlb_flush_local(va); return ((void *)crashdumpmap); } /************************************* * * TLB & cache maintenance routines. * *************************************/ /* * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_tlb_flush(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush(va); } PMAP_INLINE void pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush_range(sva, size); } /* * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PTE2_* bits * are ever set, PTE2_V in particular. * - Assumes we can write to pte2s without pte2_store() atomic ops. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PTE2_V. * - Assumes a vm_offset_t will fit in a pte2 (true for arm). * Because PTE2_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_pte2list_alloc(vm_offset_t *head) { pt2_entry_t *pte2p; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte2p = pt2map_entry(va); *head = *pte2p; if (*head & PTE2_V) panic("%s: va with PTE2_V set!", __func__); *pte2p = 0; return (va); } static void pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) { pt2_entry_t *pte2p; if (va & PTE2_V) panic("%s: freeing va with PTE2_V set!", __func__); pte2p = pt2map_entry(va); *pte2p = *head; /* virtual! PTE2_V is 0 though */ *head = va; } static void pmap_pte2list_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_pte2list_free(head, va); } } /***************************************************************************** * * PMAP third and final stage initialization. * * After pmap_init() is called, PMAP subsystem is fully initialized. * *****************************************************************************/ SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static u_long nkpt2pg = NKPT2PG; SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); static int sp_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &sp_enabled, 0, "Are large page mappings enabled?"); bool pmap_ps_enabled(pmap_t pmap __unused) { return (sp_enabled != 0); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "1MB page mapping counters"); static u_long pmap_pte1_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pte1_demotions, 0, "1MB page demotions"); static u_long pmap_pte1_mappings; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pte1_mappings, 0, "1MB page mappings"); static u_long pmap_pte1_p_failures; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pte1_p_failures, 0, "1MB page promotion failures"); static u_long pmap_pte1_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pte1_promotions, 0, "1MB page promotions"); static u_long pmap_pte1_kern_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); static u_long pmap_pte1_kern_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); static __inline ttb_entry_t pmap_ttb_get(pmap_t pmap) { return (vtophys(pmap->pm_pt1) | ttb_flags); } /* * Initialize a vm_page's machine-dependent fields. * * Variations: * 1. Pages for L2 page tables are always not managed. So, pv_list and * pt2_wirecount can share same physical space. However, proper * initialization on a page alloc for page tables and reinitialization * on the page free must be ensured. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); pt2_wirecount_init(m); m->md.pat_mode = VM_MEMATTR_DEFAULT; } /* * Virtualization for faster way how to zero whole page. */ static __inline void pagezero(void *page) { bzero(page, PAGE_SIZE); } /* * Zero L2 page table page. * Use same KVA as in pmap_zero_page(). */ static __inline vm_paddr_t pmap_pt2pg_zero(vm_page_t m) { pt2_entry_t *cmap2_pte2p; vm_paddr_t pa; struct pcpu *pc; pa = VM_PAGE_TO_PHYS(m); /* * XXX: For now, we map whole page even if it's already zero, * to sync it even if the sync is only DSB. */ sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); /* Even VM_ALLOC_ZERO request is only advisory. */ if ((m->flags & PG_ZERO) == 0) pagezero(pc->pc_cmap2_addr); pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); return (pa); } /* * Init just allocated page as L2 page table(s) holder * and return its physical address. */ static __inline vm_paddr_t pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_paddr_t pa; pt2_entry_t *pte2p; /* Check page attributes. */ if (m->md.pat_mode != pt_memattr) pmap_page_set_memattr(m, pt_memattr); /* Zero page and init wire counts. */ pa = pmap_pt2pg_zero(m); pt2_wirecount_init(m); /* * Map page to PT2MAP address space for given pmap. * Note that PT2MAP space is shared with all pmaps. */ if (pmap == kernel_pmap) pmap_kenter_pt2tab(va, PTE2_KPT(pa)); else { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, PTE2_KPT_NG(pa)); } return (pa); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; pt2_entry_t *pte2p, pte2; u_int i, pte1_idx, pv_npg; PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); /* * Initialize the vm page array entries for kernel pmap's * L2 page table pages allocated in advance. */ pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { vm_paddr_t pa; vm_page_t m; pte2 = pte2_load(pte2p); KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: L2 page table page is out of range", __func__)); m->pindex = pte1_idx; m->phys_addr = pa; pte1_idx += NPT2_IN_PG; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); if (sp_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("%s: can't assign to pagesizes[1]", __func__)); pagesizes[1] = PTE1_SIZE; } /* * Calculate the size of the pv head table for sections. * Handle the possibility that "vm_phys_segs[...].end" is zero. * Note that the table is only for sections which could be promoted. */ first_managed_pa = pte1_trunc(vm_phys_segs[0].start); pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) - first_managed_pa) / PTE1_SIZE + 1; /* * Allocate memory for the pv head table for sections. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("%s: not enough kvm for pv chunks", __func__); pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { u_int anychanged; pt2_entry_t *epte2p, *pte2p, pte2; vm_page_t m; vm_paddr_t pa; anychanged = 0; pte2p = pt2map_entry(sva); epte2p = pte2p + count; while (pte2p < epte2p) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pte2 = pte2_load(pte2p); if ((pte2_pa(pte2) != pa) || (pte2_attr(pte2) != vm_page_pte2_attr(m))) { anychanged++; pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); } pte2p++; } if (__predict_false(anychanged)) tlb_flush_range(sva, count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } tlb_flush_range(sva, va - sva); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); } /* * If the given pmap is not the current or kernel pmap, the returned * pte2 must be released by passing it to pmap_pte2_release(). */ static pt2_entry_t * pmap_pte2(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Releases a pte2 that was obtained from pmap_pte2(). * Be prepared for the pte2p being NULL. */ static __inline void pmap_pte2_release(pt2_entry_t *pte2p) { if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { mtx_unlock(&PMAP2mutex); } } /* * Super fast pmap_pte2 routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire tlb flush for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt2_entry_t * pmap_pte2_quick(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t *pte2p; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) pa = pte1_pa(pte1) | (va & PTE1_OFFSET); else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); pmap_pte2_release(pte2p); } else pa = 0; PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2, *pte2p; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (pte2_is_valid(pte2) && (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Grow the number of kernel L2 page table entries, if needed. */ void pmap_growkernel(vm_offset_t addr) { vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pt1_entry_t pte1; pt2_entry_t pte2; PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); /* * All the time kernel_vm_end is first KVA for which underlying * L2 page table is either not allocated or linked from L1 page table * (not considering sections). Except for two possible cases: * * (1) in the very beginning as long as pmap_growkernel() was * not called, it could be first unused KVA (which is not * rounded up to PTE1_SIZE), * * (2) when all KVA space is mapped and vm_map_max(kernel_map) * address is not rounded up to PTE1_SIZE. (For example, * it could be 0xFFFFFFFF.) */ kernel_vm_end = pte1_roundup(kernel_vm_end); mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, PTE1_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pte1 = pte1_load(kern_pte1(kernel_vm_end)); if (pte1_is_valid(pte1)) { kernel_vm_end += PTE1_SIZE; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } /* * kernel_vm_end_new is used in pmap_pinit() when kernel * mappings are entered to new pmap all at once to avoid race * between pmap_kenter_pte1() and kernel_vm_end increase. * The same aplies to pmap_kenter_pt2tab(). */ kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into kernel PT2TAB. */ - m = vm_page_alloc(NULL, - pte1_index(kernel_vm_end) & ~PT2PG_MASK, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow kernel", __func__); + m->pindex = pte1_index(kernel_vm_end) & ~PT2PG_MASK; + /* * QQQ: To link all new L2 page tables from L1 page * table now and so pmap_kenter_pte1() them * at once together with pmap_kenter_pt2tab() * could be nice speed up. However, * pmap_growkernel() does not happen so often... * QQQ: The other TTBR is another option. */ pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, m); } else pt2pg_pa = pte2_pa(pte2); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); kernel_vm_end = kernel_vm_end_new; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = vm_max_kernel_address - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = vm_max_kernel_address - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU", "Amount of KVM free"); /*********************************************** * * Pmap allocation/deallocation routines. * ***********************************************/ /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); PMAP_LOCK_INIT(pmap); /* * Kernel page table directory and pmap stuff around is already * initialized, we are using it right now and here. So, finish * only PMAP structures initialization for process0 ... * * Since the L1 page table and PT2TAB is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pt1 = kern_pt1; pmap->pm_pt2tab = kern_pt2tab; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); CPU_SET(0, &pmap->pm_active); } static __inline void pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pte1_index(sva); count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); bcopy(spte1p + idx, dpte1p + idx, count); } static __inline void pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pt2tab_index(sva); count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); bcopy(spte2p + idx, dpte2p + idx, count); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; vm_paddr_t pa, pt2tab_pa; u_int i; PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, pmap->pm_pt1)); /* * No need to allocate L2 page table space yet but we do need * a valid L1 page table and PT2TAB table. * * Install shared kernel mappings to these tables. It's a little * tricky as some parts of KVA are reserved for vectors, devices, * and whatever else. These parts are supposed to be above * vm_max_kernel_address. Thus two regions should be installed: * * (1) . * * QQQ: The second region should be stable enough to be installed * only once in time when the tables are allocated. * QQQ: Maybe copy of both regions at once could be faster ... * QQQ: Maybe the other TTBR is an option. * * Finally, install own PT2TAB table to these tables. */ if (pmap->pm_pt1 == NULL) { pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); if (pmap->pm_pt1 == NULL) return (0); } if (pmap->pm_pt2tab == NULL) { /* * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page * only, what should be the only size for 32 bit systems, * then we could allocate it with vm_page_alloc() and all * the stuff needed as other L2 page table pages. * (2) Note that a process PT2TAB is special L2 page table * page. Its mapping in kernel_arena is permanent and can * be used no matter which process is current. Its mapping * in PT2MAP can be used only for current process. */ pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); if (pmap->pm_pt2tab == NULL) { /* * QQQ: As struct pmap is allocated from UMA with * UMA_ZONE_NOFREE flag, it's important to leave * no allocation in pmap if initialization failed. */ kmem_free((vm_offset_t)pmap->pm_pt1, NB_IN_PT1); pmap->pm_pt1 = NULL; return (0); } /* * QQQ: Each L2 page table page vm_page_t has pindex set to * pte1 index of virtual address mapped by this page. * It's not valid for non kernel PT2TABs themselves. * The pindex of these pages can not be altered because * of the way how they are allocated now. However, it * should not be a problem. */ } mtx_lock_spin(&allpmaps_lock); /* * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), * kernel_vm_end_new is used here instead of kernel_vm_end. */ pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, kernel_vm_end_new - 1); pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 0xFFFFFFFF); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, kernel_vm_end_new - 1); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 0xFFFFFFFF); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. * I.e. self reference mapping. The PT2TAB is private, however mapped * into shared PT2MAP space, so the mapping should be not global. */ pt2tab_pa = vtophys(pmap->pm_pt2tab); pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* Insert PT2MAP PT2s into pmap PT1. */ pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Now synchronize new mapping which was made above. */ pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } #ifdef INVARIANTS static boolean_t pt2tab_user_is_empty(pt2_entry_t *tab) { u_int i, end; end = pt2tab_index(VM_MAXUSER_ADDRESS); for (i = 0; i < end; i++) if (tab[i] != 0) return (FALSE); return (TRUE); } #endif /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { #ifdef INVARIANTS vm_offset_t start, end; #endif KASSERT(pmap->pm_stats.resident_count == 0, ("%s: pmap resident count %ld != 0", __func__, pmap->pm_stats.resident_count)); KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), ("%s: has allocated user PT2(s)", __func__)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); #ifdef INVARIANTS start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); bzero((char *)pmap->pm_pt1 + start, end - start); start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); bzero((char *)pmap->pm_pt2tab + start, end - start); #endif /* * We are leaving PT1 and PT2TAB allocated on released pmap, * so hopefully UMA vmspace_zone will always be inited with * UMA_ZONE_NOFREE flag. */ } /********************************************************* * * L2 table pages and their pages management routines. * *********************************************************/ /* * Virtual interface for L2 page table wire counting. * * Each L2 page table in a page has own counter which counts a number of * valid mappings in a table. Global page counter counts mappings in all * tables in a page plus a single itself mapping in PT2TAB. * * During a promotion we leave the associated L2 page table counter * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * * If a page m->ref_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ static __inline void pt2_wirecount_init(vm_page_t m) { u_int i; /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and * m->ref_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) m->md.pt2_wirecount[i] = 0; } static __inline void pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) { /* * Note: A just modificated pte2 (i.e. already allocated) * is acquiring one extra reference which must be * explicitly cleared. It influences the KASSERTs herein. * All L2 page tables in a page always belong to the same * pmap, so we allow only one extra reference for the page. */ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); m->ref_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } static __inline void pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) { KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); KASSERT(m->ref_count > 1, ("%s: PT2PG is underflowing ...", __func__)); m->ref_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } static __inline void pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) { KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; m->ref_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count)); } static __inline uint32_t pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) { return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); } static __inline boolean_t pt2_is_empty(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); } static __inline boolean_t pt2_is_full(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == NPTE2_IN_PT2); } static __inline boolean_t pt2pg_is_empty(vm_page_t m) { return (m->ref_count == 1); } /* * This routine is called if the L2 page table * is not mapped correctly. */ static vm_page_t _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { uint32_t pte1_idx; pt1_entry_t *pte1p; pt2_entry_t pte2; vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pte1_idx = pte1_index(va); pte1p = pmap->pm_pt1 + pte1_idx; KASSERT(pte1_load(pte1p) == 0, ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, pte1_load(pte1p))); pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into pmap PT2TAB. */ - m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, * the L2 page table page may have been allocated. */ return (NULL); } + m->pindex = pte1_idx & ~PT2PG_MASK; pmap->pm_stats.resident_count++; pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { pt2pg_pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pt2pg_pa); } pt2_wirecount_inc(m, pte1_idx); pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); return (m); } static vm_page_t pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { u_int pte1_idx; pt1_entry_t *pte1p, pte1; vm_page_t m; pte1_idx = pte1_index(va); retry: pte1p = pmap->pm_pt1 + pte1_idx; pte1 = pte1_load(pte1p); /* * This supports switching from a 1MB page to a * normal 4K page. */ if (pte1_is_section(pte1)) { (void)pmap_demote_pte1(pmap, pte1p, va); /* * Reload pte1 after demotion. * * Note: Demotion can even fail as either PT2 is not find for * the virtual address or PT2PG can not be allocated. */ pte1 = pte1_load(pte1p); } /* * If the L2 page table page is mapped, we just increment the * hold count, and activate it. */ if (pte1_is_link(pte1)) { m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(m, pte1_idx); } else { /* * Here if the PT2 isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte2(pmap, va, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /* * Schedule the specified unused L2 page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) { /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ #ifdef PMAP_DEBUG pmap_zero_page_check(m); #endif m->flags |= PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Unwire L2 page tables page. */ static void pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt1_entry_t *pte1p, opte1 __unused; pt2_entry_t *pte2p; uint32_t i; KASSERT(pt2pg_is_empty(m), ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); /* * Unmap all L2 page tables in the page from L1 page table. * * QQQ: Individual L2 page tables (except the last one) can be unmapped * earlier. However, we are doing that this way. */ KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); pte1p = pmap->pm_pt1 + m->pindex; for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { KASSERT(m->md.pt2_wirecount[i] == 0, ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); opte1 = pte1_load(pte1p); if (pte1_is_link(opte1)) { pte1_clear(pte1p); /* * Flush intermediate TLB cache. */ pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); } #ifdef INVARIANTS else KASSERT((opte1 == 0) || pte1_is_section(opte1), ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, pmap, va, opte1, i)); #endif } /* * Unmap the page from PT2TAB. */ pte2p = pmap_pt2tab_entry(pmap, va); (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); m->ref_count = 0; pmap->pm_stats.resident_count--; /* * This barrier is so that the ordinary store unmapping * the L2 page table page is globally performed before TLB shoot- * down is begun. */ wmb(); vm_wire_sub(1); } /* * Decrements a L2 page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static __inline boolean_t pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pt2_wirecount_dec(m, pte1_index(va)); if (pt2pg_is_empty(m)) { /* * QQQ: Wire count is zero, so whole page should be zero and * we can set PG_ZERO flag to it. * Note that when promotion is enabled, it takes some * more efforts. See pmap_unwire_pt2_all() below. */ pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); return (TRUE); } else return (FALSE); } /* * Drop a L2 page table page's wire count at once, which is used to record * the number of valid L2 page table entries within the page. If the wire * count drops to zero, then the L2 page table page is unmapped. */ static __inline void pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { u_int pte1_idx = pte1_index(va); KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx), ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count, pt2_wirecount_get(m, pte1_idx))); /* * It's possible that the L2 page table was never used. * It happened in case that a section was created without promotion. */ if (pt2_is_full(m, va)) { pt2_wirecount_set(m, pte1_idx, 0); /* * QQQ: We clear L2 page table now, so when L2 page table page * is going to be freed, we can set it PG_ZERO flag ... * This function is called only on section mappings, so * hopefully it's not to big overload. * * XXX: If pmap is current, existing PT2MAP mapping could be * used for zeroing. */ pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); } #ifdef INVARIANTS else KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", __func__, pt2_wirecount_get(m, pte1_idx))); #endif if (pt2pg_is_empty(m)) { pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); } } /* * After removing a L2 page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static boolean_t pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt1_entry_t pte1; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (FALSE); pte1 = pte1_load(pmap_pte1(pmap, va)); mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); return (pmap_unwire_pt2(pmap, va, mpte, free)); } /************************************* * * Page management routines. * *************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * Is given page managed? */ static __inline bool is_managed(vm_paddr_t pa) { vm_page_t m; m = PHYS_TO_VM_PAGE(pa); if (m == NULL) return (false); return ((m->oflags & VPO_UNMANAGED) == 0); } static __inline bool pte1_is_managed(pt1_entry_t pte1) { return (is_managed(pte1_pa(pte1))); } static __inline bool pte2_is_managed(pt2_entry_t pte2) { return (is_managed(pte2_pa(pte2))); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pt1_entry_t *pte1p; pmap_t pmap; pt2_entry_t *pte2p, tpte2; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffs(inuse) - 1; pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) continue; pte2p = pmap_pte2(pmap, va); tpte2 = pte2_load(pte2p); if ((tpte2 & PTE2_W) == 0) tpte2 = pte2_load_clear(pte2p); pmap_pte2_release(pte2p); if ((tpte2 & PTE2_W) != 0) continue; KASSERT(tpte2 != 0, ("pmap_pv_reclaim: pmap %p va %#x zero pte", pmap, va)); pmap_tlb_flush(pmap, va); m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); if (pte2_is_dirty(tpte2)) vm_page_dirty(m); if ((tpte2 & PTE2_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt2(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; vm_wire_add(1); } vm_page_free_pages_toq(&free, false); return (m_pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire_noq(m); vm_page_free(m); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); } /* * Free the pv_entry back to the free list. */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } /* * Get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entries tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffs(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the pte2list "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_pte2list_alloc() completes. */ - if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if (pv_vafree == 0 || + (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } static void pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); /* * Transfer the 1mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pte1: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 1mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a section. */ static bool pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) { struct md_page *pvh; pv_entry_t pv; bool noreclaim; rw_assert(&pvh_global_lock, RA_WLOCKED); noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; if ((noreclaim && pv_entry_count >= pv_entry_high_water) || (pv = get_pv_entry(pmap, noreclaim)) == NULL) return (false); pv->pv_va = va; pvh = pa_to_pvh(pte1_pa(pte1)); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (true); } static inline void pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) { /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); else pmap_tlb_flush(pmap, pte1_trunc(va)); } /* * Update kernel pte1 on all pmaps. * * The following function is called only on one cpu with disabled interrupts. * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way * nobody can invoke explicit hardware table walk during the update of pte1. * Unsolicited hardware table walk can still happen, invoked by speculative * data or instruction prefetch or even by speculative hardware table walk. * * The break-before-make approach should be implemented here. However, it's * not so easy to do that for kernel mappings as it would be unhappy to unmap * itself unexpectedly but voluntarily. */ static void pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; /* * Get current pmap. Interrupts should be disabled here * so PCPU_GET() is done atomically. */ pmap = PCPU_GET(curpmap); if (pmap == NULL) pmap = kernel_pmap; /* * (1) Change pte1 on current pmap. * (2) Flush all obsolete TLB entries on current CPU. * (3) Change pte1 on all pmaps. * (4) Flush all obsolete TLB entries on all CPUs in SMP case. */ pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) { pmap_pte1_kern_promotions++; tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); } else { pmap_pte1_kern_demotions++; tlb_flush_local(pte1_trunc(va)); } /* * In SMP case, this function is called when all cpus are at smp * rendezvous, so there is no need to use 'allpmaps_lock' lock here. * In UP case, the function is called with this lock locked. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } #ifdef SMP /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) tlb_flush_range(pte1_trunc(va), PTE1_SIZE); else tlb_flush(pte1_trunc(va)); #endif } #ifdef SMP struct pte1_action { vm_offset_t va; pt1_entry_t npte1; u_int update; /* CPU that updates the PTE1 */ }; static void pmap_update_pte1_action(void *arg) { struct pte1_action *act = arg; if (act->update == PCPU_GET(cpuid)) pmap_update_pte1_kernel(act->va, act->npte1); } /* * Change pte1 on current pmap. * Note that kernel pte1 must be changed on all pmaps. * * According to the architecture reference manual published by ARM, * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. * According to this manual, UNPREDICTABLE behaviours must never happen in * a viable system. In contrast, on x86 processors, it is not specified which * TLB entry mapping the virtual address will be used, but the MMU doesn't * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone * Black). * * It's a problem when either promotion or demotion is being done. The pte1 * update and appropriate TLB flush must be done atomically in general. */ static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { struct pte1_action act; sched_pin(); act.va = va; act.npte1 = npte1; act.update = PCPU_GET(cpuid); smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, pmap_update_pte1_action, NULL, &act); sched_unpin(); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It can cause L1 translation aborts on other * cores in SMP case. So, special treatment is implemented * in pmap_fault(). To reduce the likelihood that another core * will be affected by the broken mapping, disable interrupts * until the mapping change is completed. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #else static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { mtx_lock_spin(&allpmaps_lock); pmap_update_pte1_kernel(va, npte1); mtx_unlock_spin(&allpmaps_lock); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It's absolutely safe in UP case when interrupts * are disabled. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #endif #if VM_NRESERVLEVEL > 0 /* * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are * within a single page table page (PT2) to a single 1MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PTE1s are replicated in each pmap but * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only * read the PTE1 from the kernel pmap. */ static void pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t npte1; pt2_entry_t *fpte2p, fpte2, fpte2_fav; pt2_entry_t *pte2p, pte2; vm_offset_t pteva __unused; vm_page_t m __unused; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is * either invalid, unused, or does not map the first 4KB physical page * within a 1MB page. */ fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); fpte2 = pte2_load(fpte2p); if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != (PTE2_A | PTE2_V)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", __func__, va, pmap); return; } if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", __func__, va, pmap); return; } if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set without * a TLB invalidation. */ fpte2 |= PTE2_RO; pte2_store(fpte2p, fpte2); } /* * Examine each of the other PTE2s in the specified PT2. Abort if this * PTE2 maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE2. */ fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { pte2 = pte2_load(pte2p); if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", __func__, va, pmap); return; } if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set * without a TLB invalidation. See note above. */ pte2 |= PTE2_RO; pte2_store(pte2p, pte2); pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & PTE2_FRAME); CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", __func__, pteva, pmap); } if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", __func__, va, pmap); return; } fpte2_fav -= PTE2_SIZE; } /* * The page table page in its current state will stay in PT2TAB * until the PTE1 mapping the section is demoted by pmap_demote_pte1() * or destroyed by pmap_remove_pte1(). * * Note that L2 page table size is not equal to PAGE_SIZE. */ m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: PT2 page is out of range", __func__)); KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); /* * Get pte1 from pte2 format. */ npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; /* * Promote the pv entries. */ if (pte2_is_managed(fpte2)) pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); /* * Promote the mappings. */ pmap_change_pte1(pmap, pte1p, va, npte1); pmap_pte1_promotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Zero L2 page table page. */ static __inline void pmap_clear_pt2(pt2_entry_t *fpte2p) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) pte2_clear(pte2p); } /* * Removes a 1MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { vm_page_t m; uint32_t pte1_idx; pt2_entry_t *fpte2p; vm_paddr_t pt2_pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); m = pmap_pt2_page(pmap, va); if (m == NULL) /* * QQQ: Is this function called only on promoted pte1? * We certainly do section mappings directly * (without promotion) in kernel !!! */ panic("%s: missing pt2 page", __func__); pte1_idx = pte1_index(va); /* * Initialize the L2 page table. */ fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); pmap_clear_pt2(fpte2p); /* * Remove the mapping. */ pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); /* * QQQ: We do not need to invalidate PT2MAP mapping * as we did not change it. I.e. the L2 page table page * was and still is mapped the same way. */ } /* * Do the things to unmap a section in a process */ static void pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, struct spglist *free) { pt1_entry_t opte1; struct md_page *pvh; vm_offset_t eva, va; vm_page_t m; PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); /* * Clear and invalidate the mapping. It should occupy one and only TLB * entry. So, pmap_tlb_flush() called with aligned address should be * sufficient. */ opte1 = pte1_load_clear(pte1p); pmap_tlb_flush(pmap, sva); if (pte1_is_wired(opte1)) pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; if (pte1_is_managed(opte1)) { pvh = pa_to_pvh(pte1_pa(opte1)); pmap_pvh_free(pvh, pmap, sva); eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) { if (pte1_is_dirty(opte1)) vm_page_dirty(m); if (opte1 & PTE1_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { /* * L2 page table(s) can't be removed from kernel map as * kernel counts on it (stuff around pmap_growkernel()). */ pmap_remove_kernel_pte1(pmap, pte1p, sva); } else { /* * Get associated L2 page table page. * It's possible that the page was never allocated. */ m = pmap_pt2_page(pmap, sva); if (m != NULL) pmap_unwire_pt2_all(pmap, sva, m, free); } } /* * Fills L2 page table page with mappings to consecutive physical pages. */ static __inline void pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { pte2_store(pte2p, npte2); npte2 += PTE2_SIZE; } } /* * Tries to demote a 1MB page mapping. If demotion fails, the * 1MB page mapping is invalidated. */ static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t opte1, npte1; pt2_entry_t *fpte2p, npte2; vm_paddr_t pt2pg_pa, pt2_pa; vm_page_t m; struct spglist free; uint32_t pte1_idx, isnew = 0; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); opte1 = pte1_load(pte1p); KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { KASSERT(!pte1_is_wired(opte1), ("%s: PT2 page for a wired mapping is missing", __func__)); /* * Invalidate the 1MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ - if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, - pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { + if ((opte1 & PTE1_A) == 0 || + (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); vm_page_free_pages_toq(&free, false); CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", __func__, va, pmap); return (FALSE); } + m->pindex = pte1_index(va) & ~PT2PG_MASK; if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; isnew = 1; /* * We init all L2 page tables in the page even if * we are going to change everything for one L2 page * table in a while. */ pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { if (va < VM_MAXUSER_ADDRESS) { if (pt2_is_empty(m, va)) isnew = 1; /* Demoting section w/o promotion. */ #ifdef INVARIANTS else KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" " count %u", __func__, pt2_wirecount_get(m, pte1_index(va)))); #endif } } pt2pg_pa = VM_PAGE_TO_PHYS(m); pte1_idx = pte1_index(va); /* * If the pmap is current, then the PT2MAP can provide access to * the page table page (promoted L2 page tables are not unmapped). * Otherwise, temporarily map the L2 page table page (m) into * the kernel's address space at either PADDR1 or PADDR2. * * Note that L2 page table size is not equal to PAGE_SIZE. */ if (pmap_is_current(pmap)) fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); } else { mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); } pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); npte1 = PTE1_LINK(pt2_pa); KASSERT((opte1 & PTE1_A) != 0, ("%s: opte1 is missing PTE1_A", __func__)); KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, ("%s: opte1 has PTE1_NM", __func__)); /* * Get pte2 from pte1 format. */ npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; /* * If the L2 page table page is new, initialize it. If the mapping * has changed attributes, update the page table entries. */ if (isnew != 0) { pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); pmap_fill_pt2(fpte2p, npte2); } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != (npte2 & PTE2_PROMOTE)) pmap_fill_pt2(fpte2p, npte2); KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), ("%s: fpte2p and npte2 map different physical addresses", __func__)); if (fpte2p == PADDR2) mtx_unlock(&PMAP2mutex); /* * Demote the mapping. This pmap is locked. The old PTE1 has * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also * has not PTE1_NM set. Thus, there is no danger of a race with * another processor changing the setting of PTE1_A and/or PTE1_NM * between the read above and the store below. */ pmap_change_pte1(pmap, pte1p, va, npte1); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_pv_reclaim(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PTE1 must have already changed from mapping * the 1mpage to referencing the page table page. */ if (pte1_is_managed(opte1)) pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); pmap_pte1_demotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); return (TRUE); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pt2_entry_t npte2, opte2; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte2, om; int rv; va = trunc_page(va); KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), ("%s: managed mapping within the clean submap", __func__)); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("%s: flags %u has reserved bits set", __func__, flags)); pa = VM_PAGE_TO_PHYS(m); npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); if ((flags & VM_PROT_WRITE) == 0) npte2 |= PTE2_NM; if ((prot & VM_PROT_WRITE) == 0) npte2 |= PTE2_RO; KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; if ((flags & PMAP_ENTER_WIRED) != 0) npte2 |= PTE2_W; if (va < VM_MAXUSER_ADDRESS) npte2 |= PTE2_U; if (pmap != kernel_pmap) npte2 |= PTE2_NG; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PTE1_OFFSET) == 0, ("%s: va unaligned", __func__)); KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | PTE1_V, flags, m); goto out; } /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte2 = pmap_allocpte2(pmap, va, flags); if (mpte2 == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte2 failed with sleep allowed")); rv = KERN_RESOURCE_SHORTAGE; goto out; } } else mpte2 = NULL; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) panic("%s: attempted on 1MB page", __func__); pte2p = pmap_pte2_quick(pmap, va); if (pte2p == NULL) panic("%s: invalid L1 page table entry va=%#x", __func__, va); om = NULL; opte2 = pte2_load(pte2p); opa = pte2_pa(opte2); /* * Mapping has not changed, must be protection or wiring change. */ if (pte2_is_valid(opte2) && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT2 pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT2 page will be also. */ if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) pmap->pm_stats.wired_count++; else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; /* * Remove extra pte2 reference */ if (mpte2) pt2_wirecount_dec(mpte2, pte1_index(va)); if ((m->oflags & VPO_UNMANAGED) == 0) om = m; goto validate; } /* * QQQ: We think that changing physical address on writeable mapping * is not safe. Well, maybe on kernel address space with correct * locking, it can make a sense. However, we have no idea why * anyone should do that on user address space. Are we wrong? */ KASSERT((opa == 0) || (opa == pa) || !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", __func__, pmap, va, opte2, opa, pa, flags, prot)); pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; om = PHYS_TO_VM_PAGE(opa); if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) om = NULL; if (om != NULL) pv = pmap_pvh_remove(&om->md, pmap, va); /* * Remove extra pte2 reference */ if (mpte2 != NULL) pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (pte2_is_wired(npte2)) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ if (prot & VM_PROT_WRITE) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * If the mapping or permission bits are different, we need * to update the pte2. * * QQQ: Think again and again what to do * if the mapping is going to be changed! */ if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, there is a race * for other threads of current process in lazy loading case. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). * * QQQ: (1) Does it exist any better way where * or how to sync icache? * (2) Now, we do it on a page basis. */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pat_mode == VM_MEMATTR_WB_WA && (opa != pa || (opte2 & PTE2_NX))) cache_icache_sync_fresh(va, pa, PAGE_SIZE); if (opte2 & PTE2_V) { /* Change mapping with break-before-make approach. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); pte2_store(pte2p, npte2); if (om != NULL) { KASSERT((om->oflags & VPO_UNMANAGED) == 0, ("%s: om %p unmanaged", __func__, om)); if ((opte2 & PTE2_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); if (pte2_is_dirty(opte2)) vm_page_dirty(om); if (TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else pte2_store(pte2p, npte2); } #if 0 else { /* * QQQ: In time when both access and not mofified bits are * emulated by software, this should not happen. Some * analysis is need, if this really happen. Missing * tlb flush somewhere could be the reason. */ panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, va, opte2, npte2); } #endif #if VM_NRESERVLEVEL > 0 /* * If both the L2 page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pte1(pmap, pte1p, va); #endif rv = KERN_SUCCESS; out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Do the things to unmap a page in a process. */ static int pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, struct spglist *free) { pt2_entry_t opte2; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Clear and invalidate the mapping. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", __func__, pmap, va, opte2)); if (opte2 & PTE2_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte2_is_managed(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt2(pmap, va, free)); } /* * Remove a single page from a process address space. */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt2_entry_t *pte2p; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || !pte2_is_valid(pte2_load(pte2p))) return; pmap_remove_pte2(pmap, pte2p, va, free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * Special handling of removing one page. A very common * operation and easy to short circuit some code. */ if (sva + PAGE_SIZE == eva) { pte1 = pte1_load(pmap_pte1(pmap, sva)); if (pte1_is_link(pte1)) { pmap_remove_page(pmap, sva, &free); goto out; } } for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; if (pmap->pm_stats.resident_count == 0) break; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that the L1 page * table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_remove_pte1(pmap, pte1p, sva, &free); continue; } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* The large page mapping was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion. */ pte1 = pte1_load(pte1p); } #endif } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being removed. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, sva, &free)) break; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt2_entry_t *pte2p, opte2; pt1_entry_t *pte1p; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " "a 1mpage in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, pv->pv_va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", __func__, pmap, pv->pv_va)); if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_unuse_pt2(pmap, pv->pv_va, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m, mt, mpt2pg; struct md_page *pvh; pa = pte1_pa(pte1); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte1 %#x", __func__, pte1)); if (pte1_is_dirty(pte1)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; pvh = pa_to_pvh(pa); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpt2pg = pmap_pt2_page(pmap, pv->pv_va); if (mpt2pg != NULL) pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m; struct md_page *pvh; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte2 %#x", __func__, pte2)); if (pte2_is_dirty(pte2)) vm_page_dirty(m); pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(pa); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt2(pmap, pv->pv_va, free); } /* * Remove all pages from specified address space this aids process * exit speeds. Also, this code is special cased for current process * only, but can have the more generic (and slightly slower) mode enabled. * This is much faster than pmap_remove in the case of running down * an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; pv_entry_t pv; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; boolean_t allfree; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), ("%s: non-current pmap %p", __func__, pmap)); #if defined(SMP) && defined(INVARIANTS) { cpuset_t other_cpus; sched_pin(); other_cpus = pmap->pm_active; CPU_CLR(PCPU_GET(cpuid), &other_cpus); sched_unpin(); KASSERT(CPU_EMPTY(&other_cpus), ("%s: pmap %p active on other cpus", __func__, pmap)); } #endif SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", __func__, pmap, pc->pc_pmap)); allfree = TRUE; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = ffs(inuse) - 1; bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; /* * Note that we cannot remove wired pages * from a process' mapping at this time */ pte1p = pmap_pte1(pmap, pv->pv_va); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) { allfree = FALSE; continue; } pte1_clear(pte1p); pmap_remove_pte1_quick(pmap, pte1, pv, &free); } else if (pte1_is_link(pte1)) { pte2p = pt2map_entry(pv->pv_va); pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) { printf("%s: pmap %p va %#x " "pte2 %#x\n", __func__, pmap, pv->pv_va, pte2); panic("bad pte2"); } if (pte2_is_wired(pte2)) { allfree = FALSE; continue; } pte2_clear(pte2p); pmap_remove_pte2_quick(pmap, pte2, pv, &free); } else { printf("%s: pmap %p va %#x pte1 %#x\n", __func__, pmap, pv->pv_va, pte1); panic("bad pte1"); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } tlb_flush_all_ng_local(); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * This code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No L2 page table pages. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpt2pg) { pt2_entry_t *pte2p, pte2; vm_paddr_t pa; struct spglist free; uint32_t l2prot; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("%s: managed mapping within the clean submap", __func__)); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a L2 page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int pte1_idx; pt1_entry_t pte1, *pte1p; vm_paddr_t pt2_pa; /* * Get L1 page table things. */ pte1_idx = pte1_index(va); pte1p = pmap_pte1(pmap, va); pte1 = pte1_load(pte1p); if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { /* * Each of NPT2_IN_PG L2 page tables on the page can * come here. Make sure that associated L1 page table * link is established. * * QQQ: It comes that we don't establish all links to * L2 page tables for newly allocated L2 page * tables page. */ KASSERT(!pte1_is_section(pte1), ("%s: pte1 %#x is section", __func__, pte1)); if (!pte1_is_link(pte1)) { pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } pt2_wirecount_inc(mpt2pg, pte1_idx); } else { /* * If the L2 page table page is mapped, we just * increment the hold count, and activate it. */ if (pte1_is_section(pte1)) { return (NULL); } else if (pte1_is_link(pte1)) { mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(mpt2pg, pte1_idx); } else { mpt2pg = _pmap_allocpte2(pmap, va, PMAP_ENTER_NOSLEEP); if (mpt2pg == NULL) return (NULL); } } } else { mpt2pg = NULL; } /* * This call to pt2map_entry() makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte2_quick(). * But that isn't as quick as pt2map_entry(). */ pte2p = pt2map_entry(va); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { if (mpt2pg != NULL) { /* * Remove extra pte2 reference */ pt2_wirecount_dec(mpt2pg, pte1_index(va)); mpt2pg = NULL; } return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpt2pg != NULL) { SLIST_INIT(&free); if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { pmap_tlb_flush(pmap, va); vm_page_free_pages_toq(&free, false); } mpt2pg = NULL; } return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ pa = VM_PAGE_TO_PHYS(m); l2prot = PTE2_RO | PTE2_NM; if (va < VM_MAXUSER_ADDRESS) l2prot |= PTE2_U | PTE2_NG; if ((prot & VM_PROT_EXECUTE) == 0) l2prot |= PTE2_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PAGE_SIZE); } pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); return (mpt2pg); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Tries to create a read- and/or execute-only 1 MB page mapping. Returns * true if successful. Returns false if (1) a mapping already exists at the * specified virtual address or (2) a PV entry cannot be allocated without * reclaiming another PV entry. */ static bool pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pt1_entry_t pte1; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pa = VM_PAGE_TO_PHYS(m); pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); if ((prot & VM_PROT_EXECUTE) == 0) pte1 |= PTE1_NX; if (va < VM_MAXUSER_ADDRESS) pte1 |= PTE1_U; if (pmap != kernel_pmap) pte1 |= PTE1_NG; return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); } /* * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry * allocation failed. */ static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, vm_page_t m) { struct spglist free; pt1_entry_t opte1, *pte1p; pt2_entry_t pte2, *pte2p; vm_offset_t cur, end; vm_page_t mt; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), ("%s: pte1 has inconsistent NM and RO attributes", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (pte1_is_valid(opte1)) { if ((flags & PMAP_ENTER_NOREPLACE) != 0) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if (pte1_is_section(opte1)) { /* * If the section resulted from a promotion, then a * reserved PT page could be freed. */ pmap_remove_pte1(pmap, pte1p, va, &free); } else { sched_pin(); end = va + PTE1_SIZE; for (cur = va, pte2p = pmap_pte2_quick(pmap, va); cur != end; cur += PAGE_SIZE, pte2p++) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, cur, &free)) break; } sched_unpin(); } vm_page_free_pages_toq(&free, false); } if ((m->oflags & VPO_UNMANAGED) == 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((pte1 & PTE1_RO) == 0) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if (pte1_is_wired(pte1)) pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap && (!pte1_is_section(opte1) || pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); /* * Map the section. */ pte1_store(pte1p, pte1); pmap_pte1_mappings++; CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpt2pg; vm_pindex_t diff, psize; PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", __func__, pmap, start, end, m_start, prot)); VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpt2pg = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && m->psind == 1 && sp_enabled && pmap_enter_1mpage(pmap, va, m, prot)) m = &m[PTE1_SIZE / PAGE_SIZE - 1]; else mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, mpt2pg); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pt1_entry_t *pte1p; vm_paddr_t pa, pte2_pa; vm_page_t p; vm_memattr_t pat_mode; u_int l1attr, l1prot; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("%s: non-device object", __func__)); if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 1MB page boundary. */ pte2_pa = VM_PAGE_TO_PHYS(p); if (pte2_pa & PTE1_OFFSET) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 1MB pages. * * QQQ: Well, we are mapping a section, so same condition must * be hold like during promotion. It looks that only RW mapping * is done here, so readonly mapping must be done elsewhere. */ l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); PMAP_LOCK(pmap); for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { pte1p = pmap_pte1(pmap, addr); if (!pte1_is_valid(pte1_load(pte1p))) { pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } /* Else continue on if the PTE1 is already valid. */ addr += PTE1_SIZE; } PMAP_UNLOCK(pmap); } } /* * Do the things to protect a 1mpage in a process. */ static void pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, vm_prot_t prot) { pt1_entry_t npte1, opte1; vm_offset_t eva, va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); opte1 = npte1 = pte1_load(pte1p); if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) npte1 |= PTE1_RO | PTE1_NM; if ((prot & VM_PROT_EXECUTE) == 0) npte1 |= PTE1_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte1 != opte1) { pte1_store(pte1p, npte1); pmap_tlb_flush(pmap, sva); } } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { boolean_t pv_lists_locked; vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, opte2, npte2; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_protect_pte1(pmap, pte1p, sva, prot); continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping * was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { vm_page_t m; opte2 = npte2 = pte2_load(pte2p); if (!pte2_is_valid(opte2)) continue; if ((prot & VM_PROT_WRITE) == 0) { if (pte2_is_managed(opte2) && pte2_is_dirty(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); vm_page_dirty(m); } npte2 |= PTE2_RO | PTE2_NM; } if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte2 != opte2) { pte2_store(pte2p, npte2); pmap_tlb_flush(pmap, sva); } } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt1_entry_t pte1; pt2_entry_t pte2; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) count++; } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); if (pte2_is_wired(pte2)) count++; } PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 1mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = pte1_is_dirty(pte1); } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = pte2_is_dirty(pte2); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt1_entry_t pte1; pt2_entry_t pte2; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, addr)); if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(addr)); rv = !pte2_is_valid(pte2) ; } PMAP_UNLOCK(pmap); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 1mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); } else { pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); opte1 = pte1_load(pte1p); if (pte1_is_dirty(opte1)) { /* * Although "opte1" is mapping a 1MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((opte1 & PTE1_A) != 0) { /* * Since this reference bit is shared by 256 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual section number, and the pmap * address to select one 4KB page out of the 256 * on which testing the reference bit will result * in clearing that bit. This function is designed * to avoid the selection of the same 4KB page * for every 1MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the section is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && !pte1_is_wired(opte1)) { pte1_clear_bit(pte1p, PTE1_A); pmap_tlb_flush(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(pte1_is_link(pte1_load(pte1p)), ("%s: not found a link in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if ((opte2 & PTE2_A) != 0) { pte2_clear_bit(pte2p, PTE2_A); pmap_tlb_flush(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { if (!pte1_is_wired(pte1)) panic("%s: pte1 %#x not wired", __func__, pte1); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pte1_clear_bit(pte1p, PTE1_W); pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) panic("%s: demotion failed", __func__); #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (!pte2_is_wired(pte2)) panic("%s: pte2 %#x is missing PTE2_W", __func__, pte2); /* * PTE2_W must be cleared atomically. Although the pmap * lock synchronizes access to PTE2_W, another processor * could be changing PTE2_NM and/or PTE2_A concurrently. */ pte2_clear_bit(pte2p, PTE2_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); if (!(pte1_load(pte1p) & PTE1_RO)) (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (!(opte2 & PTE2_RO)) { pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, pte2; vm_offset_t pdnxt; vm_page_t m; boolean_t pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = pte1_trunc(sva + PTE1_SIZE); if (pdnxt < sva) pdnxt = eva; pte1p = pmap_pte1(pmap, sva); opte1 = pte1_load(pte1p); if (!pte1_is_valid(opte1)) /* XXX */ continue; else if (pte1_is_section(opte1)) { if (!pte1_is_managed(opte1)) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying L2 page * table is fully populated, this removal never * frees a L2 page table page. */ if (!pte1_is_wired(opte1)) { pte2p = pmap_pte2_quick(pmap, sva); KASSERT(pte2_is_valid(pte2_load(pte2p)), ("%s: invalid PTE2", __func__)); pmap_remove_pte2(pmap, pte2p, sva, NULL); } } if (pdnxt > eva) pdnxt = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) continue; else if (pte2_is_dirty(pte2)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); vm_page_dirty(m); } pte2_set_bit(pte2p, PTE2_NM); pte2_clear_bit(pte2p, PTE2_A); } else if ((pte2 & PTE2_A) != 0) pte2_clear_bit(pte2p, PTE2_A); else continue; pmap_tlb_flush(pmap, sva); } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (!(opte1 & PTE1_RO)) { if (pmap_demote_pte1(pmap, pte1p, va) && !pte1_is_wired(opte1)) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); pte2p = pmap_pte2_quick(pmap, va); opte2 = pte2_load(pte2p); if ((opte2 & PTE2_V)) { pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); vm_page_dirty(m); pmap_tlb_flush(pmap, va); } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if (pte2_is_dirty(pte2_load(pte2p))) { pte2_set_bit(pte2p, PTE2_NM); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { pt2_entry_t *cmap2_pte2p; vm_memattr_t oma; vm_paddr_t pa; struct pcpu *pc; oma = m->md.pat_mode; m->md.pat_mode = ma; CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); if ((m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* * If "m" is a normal page, flush it from the cache. * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m, oma)) return; #endif /* * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ if (ma != oma) { pa = VM_PAGE_TO_PHYS(m); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_memattr_to_pte2(ma))); dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } } /* * Miscellaneous support routines follow */ /* * Returns TRUE if the given page is mapped individually or as part of * a 1mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap2_addr); else bzero(pc->pc_cmap2_addr + off, size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("%s: CMAP1 busy", __func__); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), PTE2_AP_KRW, vm_page_pte2_attr(dst))); bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("pmap_copy_pages: CMAP1 busy"); if (pte2_load(cmap2_pte2p) != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); a_cp = pc->pc_cmap1_addr + a_pg_offset; b_cp = pc->pc_cmap2_addr + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { struct pcpu *pc; pt2_entry_t *pte2p; critical_enter(); pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); return (pc->pc_qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { struct pcpu *pc; pt2_entry_t *pte2p; pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); pte2_clear(pte2p); tlb_flush(pc->pc_qmap_addr); critical_exit(); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t nextva; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = nextva) { pt2_entry_t *src_pte2p, *dst_pte2p; vm_page_t dst_mpt2pg, src_mpt2pg; pt1_entry_t src_pte1; u_int pte1_idx; KASSERT(addr < VM_MAXUSER_ADDRESS, ("%s: invalid to pmap_copy page tables", __func__)); nextva = pte1_trunc(addr + PTE1_SIZE); if (nextva < addr) nextva = end_addr; pte1_idx = pte1_index(addr); src_pte1 = src_pmap->pm_pt1[pte1_idx]; if (pte1_is_section(src_pte1)) { if ((addr & PTE1_OFFSET) != 0 || (addr + PTE1_SIZE) > end_addr) continue; if (dst_pmap->pm_pt1[pte1_idx] == 0 && (!pte1_is_managed(src_pte1) || pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, PMAP_ENTER_NORECLAIM))) { dst_pmap->pm_pt1[pte1_idx] = src_pte1 & ~PTE1_W; dst_pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } continue; } else if (!pte1_is_link(src_pte1)) continue; src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); /* * We leave PT2s to be linked from PT1 even if they are not * referenced until all PT2s in a page are without reference. * * QQQ: It could be changed ... */ #if 0 /* single_pt2_link_is_cleared */ KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, ("%s: source page table page is unused", __func__)); #else if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) continue; #endif if (nextva > end_addr) nextva = end_addr; src_pte2p = pt2map_entry(addr); while (addr < nextva) { pt2_entry_t temp_pte2; temp_pte2 = pte2_load(src_pte2p); /* * we only virtual copy managed pages */ if (pte2_is_managed(temp_pte2)) { dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dst_mpt2pg == NULL) goto out; dst_pte2p = pmap_pte2_quick(dst_pmap, addr); if (!pte2_is_valid(pte2_load(dst_pte2p)) && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ temp_pte2 &= ~(PTE2_W | PTE2_A); temp_pte2 |= PTE2_NM; pte2_store(dst_pte2p, temp_pte2); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_pt2(dst_pmap, addr, dst_mpt2pg, &free)) { pmap_tlb_flush(dst_pmap, addr); vm_page_free_pages_toq(&free, false); } goto out; } if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= pt2_wirecount_get(src_mpt2pg, pte1_idx)) break; } addr += PAGE_SIZE; src_pte2p++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more section mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t pte1_offset; if (size < PTE1_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); pte1_offset = offset & PTE1_OFFSET; if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || (*addr & PTE1_OFFSET) == pte1_offset) return; if ((*addr & PTE1_OFFSET) < pte1_offset) *addr = pte1_trunc(*addr) + pte1_offset; else *addr = pte1_roundup(*addr) + pte1_offset; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid, ttb; PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif ttb = pmap_ttb_get(pmap); /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_pagedir = ttb; cp15_ttbr_set(ttb); PCPU_SET(curpmap, pmap); critical_exit(); } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, addr); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); managed = pte1_is_managed(pte1); val = MINCORE_PSIND(1) | MINCORE_INCORE; if (pte1_is_dirty(pte1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte1 & PTE1_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, addr); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); pa = pte2_pa(pte2); managed = pte2_is_managed(pte2); val = MINCORE_INCORE; if (pte2_is_dirty(pte2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte2 & PTE2_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; uint32_t l2attr; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); while (size != 0) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) { pcb->pcb_pagedir = pmap_ttb_get(pmap); } /* * Clean L1 data cache range by physical address. * The range must be within a single page. */ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Sync instruction cache range which is not mapped yet. */ void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { uint32_t len, offset; vm_page_t m; /* Write back d-cache on given address range. */ offset = pa & PAGE_MASK; for ( ; size != 0; size -= len, pa += len, offset = 0) { len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) { /* Write back d-cache on given address range. */ if (va >= VM_MIN_KERNEL_ADDRESS) { dcache_wb_pou(va, size); } else { uint32_t len, offset; vm_paddr_t pa; vm_page_t m; offset = va & PAGE_MASK; for ( ; size != 0; size -= len, va += len, offset = 0) { pa = pmap_extract(pmap, va); /* offset is preserved */ len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } /* * The implementation of pmap_fault() uses IN_RANGE2() macro which * depends on the fact that given range size is a power of 2. */ CTASSERT(powerof2(NB_IN_PT1)); CTASSERT(powerof2(PT2MAP_SIZE)); #define IN_RANGE2(addr, start, size) \ ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) /* * Handle access and R/W emulation faults. */ int pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; if (pmap == NULL) pmap = kernel_pmap; /* * In kernel, we should never get abort with FAR which is in range of * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here * and print out a useful abort message and even get to the debugger * otherwise it likely ends with never ending loop of aborts. */ if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { /* * All L1 tables should always be mapped and present. * However, we check only current one herein. For user mode, * only permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", __func__, pmap, pmap->pm_pt1, far); panic("%s: pm_pt1 abort", __func__); } return (KERN_INVALID_ADDRESS); } if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { /* * PT2MAP should be always mapped and present in current * L1 table. However, only existing L2 tables are mapped * in PT2MAP. For user mode, only L2 translation abort and * permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", __func__, pmap, PT2MAP, far); panic("%s: PT2MAP abort", __func__); } return (KERN_INVALID_ADDRESS); } /* * A pmap lock is used below for handling of access and R/W emulation * aborts. They were handled by atomic operations before so some * analysis of new situation is needed to answer the following question: * Is it safe to use the lock even for these aborts? * * There may happen two cases in general: * * (1) Aborts while the pmap lock is locked already - this should not * happen as pmap lock is not recursive. However, under pmap lock only * internal kernel data should be accessed and such data should be * mapped with A bit set and NM bit cleared. If double abort happens, * then a mapping of data which has caused it must be fixed. Further, * all new mappings are always made with A bit set and the bit can be * cleared only on managed mappings. * * (2) Aborts while another lock(s) is/are locked - this already can * happen. However, there is no difference here if it's either access or * R/W emulation abort, or if it's some other abort. */ PMAP_LOCK(pmap); #ifdef INVARIANTS pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* * Check in advance that associated L2 page table is mapped into * PT2MAP space. Note that faulty access to not mapped L2 page * table is caught in more general check above where "far" is * checked that it does not lay in PT2MAP space. Note also that * L1 page table and PT2TAB always exist and are mapped. */ pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); if (!pte2_is_valid(pte2)) panic("%s: missing L2 page table (%p, %#x)", __func__, pmap, far); } #endif #ifdef SMP /* * Special treatment is due to break-before-make approach done when * pte1 is updated for userland mapping during section promotion or * demotion. If not caught here, pmap_enter() can find a section * mapping on faulting address. That is not allowed. */ if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } #endif /* * Accesss bits for page and section. Note that the entry * is not in TLB yet, so TLB flush is not necessary. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if (idx == FAULT_ACCESS_L2) { pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* L2 page table should exist and be mapped. */ pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { pte2_store(pte2p, pte2 | PTE2_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L2 access fault but PTE1 is not a link. * Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if (idx == FAULT_ACCESS_L1) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pte1_store(pte1p, pte1 | PTE1_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } else { /* * We got L1 access fault but PTE1 is not section * mapping. Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * Handle modify bits for page and section. Note that the modify * bit is emulated by software. So PTEx_RO is software read only * bit and PTEx_NM flag is real hardware read only bit. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* L2 page table should exist and be mapped. */ pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && (pte2 & PTE2_NM)) { pte2_store(pte2p, pte2 & ~PTE2_NM); tlb_flush(trunc_page(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L2 permission fault but PTE1 is not a link. * Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { pte1_store(pte1p, pte1 & ~PTE1_NM); tlb_flush(pte1_trunc(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L1 permission fault but PTE1 is not section * mapping. Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * QQQ: The previous code, mainly fast handling of access and * modify bits aborts, could be moved to ASM. Now we are * starting to deal with not fast aborts. */ PMAP_UNLOCK(pmap); return (KERN_FAILURE); } #if defined(PMAP_DEBUG) /* * Reusing of KVA used in pmap_zero_page function !!! */ static void pmap_zero_page_check(vm_page_t m) { pt2_entry_t *cmap2_pte2p; uint32_t *p, *end; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) if (*p != 0) panic("%s: page %p not zero, va: %p", __func__, m, pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte2 = 0; int i, j, index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPTE1_IN_PT1; i++) { pt1_entry_t pte1; pt2_entry_t *pte2p, pte2; vm_offset_t base, va; vm_paddr_t pa; vm_page_t m; base = i << PTE1_SHIFT; pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1_is_section(pte1)) { /* * QQQ: Do something here! */ } else if (pte1_is_link(pte1)) { for (j = 0; j < NPTE2_IN_PT2; j++) { va = base + (j << PAGE_SHIFT); if (va >= VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (!pte2_is_valid(pte2)) continue; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, w: %d, " "f: 0x%x", va, pa, m->ref_count, m->flags); npte2++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } sx_sunlock(&allproc_lock); return (npte2); } #endif #ifdef DDB static pt2_entry_t * pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (!pte1_is_link(pte1)) return (NULL); if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR3); } #ifdef SMP else if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR3); } #endif return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } static void dump_pmap(pmap_t pmap) { printf("pmap %p\n", pmap); printf(" pm_pt1: %p\n", pmap->pm_pt1); printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); } DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) { pmap_t pmap; LIST_FOREACH(pmap, &allpmaps, pm_list) { dump_pmap(pmap); } } static int pte2_class(pt2_entry_t pte2) { int cls; cls = (pte2 >> 2) & 0x03; cls |= (pte2 >> 4) & 0x04; return (cls); } static void dump_section(pmap_t pmap, uint32_t pte1_idx) { } static void dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) { uint32_t i; vm_offset_t va; pt2_entry_t *pte2p, pte2; vm_page_t m; va = pte1_idx << PTE1_SHIFT; pte2p = pmap_pte2_ddb(pmap, va); for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (pte2 == 0) continue; if (!pte2_is_valid(pte2)) { printf(" 0x%08X: 0x%08X", va, pte2); if (!invalid_ok) printf(" - not valid !!!"); printf("\n"); continue; } m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d w:%d f:0x%04X\n", m->valid, m->ref_count, m->flags); } else { printf("\n"); } } } static __inline boolean_t is_pv_chunk_space(vm_offset_t va) { if ((((vm_offset_t)pv_chunkbase) <= va) && (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) return (TRUE); return (FALSE); } DB_SHOW_COMMAND(pmap, pmap_pmap_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va, eva; vm_page_t m; uint32_t i; boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; if (have_addr) { pmap_t pm; LIST_FOREACH(pm, &allpmaps, pm_list) if (pm == pmap) break; if (pm == NULL) { printf("given pmap %p is not in allpmaps list\n", pmap); return; } } else pmap = PCPU_GET(curpmap); eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ printf("pmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); for(i = 0; i < NPTE1_IN_PT1; i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (va >= eva) break; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { dump_link_ok = TRUE; invalid_ok = FALSE; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", va, pte1, pte2, m); if (is_pv_chunk_space(va)) { printf(" - pv_chunk space"); if (dump_pv_chunk) invalid_ok = TRUE; else dump_link_ok = FALSE; } else if (m != NULL) printf(" w:%d w2:%u", m->ref_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); else if (pte2_pa(pte1) != pte2_pa(pte2)) printf(" !!! pt2tab entry is DIFFERENT - m: %p", PHYS_TO_VM_PAGE(pte2_pa(pte2))); printf("\n"); if (dump_link_ok) dump_link(pmap, i, invalid_ok); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } } static void dump_pt2tab(pmap_t pmap) { uint32_t i; pt2_entry_t pte2; vm_offset_t va; vm_paddr_t pa; vm_page_t m; printf("PT2TAB:\n"); for (i = 0; i < PT2TAB_ENTRIES; i++) { pte2 = pte2_load(&pmap->pm_pt2tab[i]); if (!pte2_is_valid(pte2)) continue; va = i << PT2TAB_SHIFT; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , w: %d, f: 0x%04X pidx: %lld", m->ref_count, m->flags, m->pindex); printf("\n"); } } DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va; uint32_t i, start; if (have_addr) { printf("supported only on current pmap\n"); return; } pmap = PCPU_GET(curpmap); printf("curpmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); start = pte1_index((vm_offset_t)PT2MAP); for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, !!(pte1 & PTE1_S)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, pte1, pte2); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO\n"); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } dump_pt2tab(pmap); } #endif diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 9fbd473abe3a..ab62ca2934f0 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -1,7173 +1,7161 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) static struct md_page * pa_to_pvh(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return ((struct md_page *)seg->md_first + pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); } panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); } static struct md_page * page_to_pvh(vm_page_t m) { struct vm_phys_seg *seg; seg = &vm_phys_segs[m->segind]; return ((struct md_page *)seg->md_first + pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); } #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise * it is dirty. This flag may only be set on managed mappings. * * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it * as a software managed bit. */ #define ATTR_SW_DBM ATTR_DBM struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); /* * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs * that it has currently allocated to a pmap, a cursor ("asid_next") to * optimize its search for a free ASID in the bit vector, and an epoch number * ("asid_epoch") to indicate when it has reclaimed all previously allocated * ASIDs that are not currently active on a processor. * * The current epoch number is always in the range [0, INT_MAX). Negative * numbers and INT_MAX are reserved for special cases that are described * below. */ struct asid_set { int asid_bits; bitstr_t *asid_set; int asid_set_size; int asid_next; int asid_epoch; struct mtx asid_set_mutex; }; static struct asid_set asids; static struct asid_set vmids; static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "ASID allocator"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, "The number of bits in an ASID"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, "The last allocated ASID plus one"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, "The current epoch number"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0, "The number of bits in an VMID"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0, "The last allocated VMID plus one"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0, "The current epoch number"); void (*pmap_clean_stage2_tlbi)(void); void (*pmap_invalidate_vpipt_icache)(void); /* * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for * dynamically allocated ASIDs have a non-negative epoch number. * * An invalid ASID is represented by -1. * * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), * which indicates that an ASID should never be allocated to the pmap, and * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be * allocated when the pmap is next activated. */ #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ ((u_long)(epoch) << 32))) #define COOKIE_TO_ASID(cookie) ((int)(cookie)) #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static bool pmap_activate_int(pmap_t pmap); static void pmap_alloc_asid(pmap_t pmap); static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static void pmap_reset_asid_set(pmap_t pmap); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) { pd_entry_t l1, *l2p; l1 = pmap_load(l1p); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0, ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK); return (&l2p[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) { pd_entry_t l2; pt_entry_t *l3p; l2 = pmap_load(l2p); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0, ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK); return (&l3p[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); static pt_entry_t pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) { pt_entry_t val; if (pmap->pm_stage == PM_STAGE1) { val = ATTR_S1_IDX(memattr); if (memattr == VM_MEMATTR_DEVICE) val |= ATTR_S1_XN; return (val); } val = 0; switch (memattr) { case VM_MEMATTR_DEVICE: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | ATTR_S2_XN(ATTR_S2_XN_ALL)); case VM_MEMATTR_UNCACHEABLE: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); case VM_MEMATTR_WRITE_BACK: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); case VM_MEMATTR_WRITE_THROUGH: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); default: panic("%s: invalid memory attribute %x", __func__, memattr); } } static pt_entry_t pmap_pte_prot(pmap_t pmap, vm_prot_t prot) { pt_entry_t val; val = 0; if (pmap->pm_stage == PM_STAGE1) { if ((prot & VM_PROT_EXECUTE) == 0) val |= ATTR_S1_XN; if ((prot & VM_PROT_WRITE) == 0) val |= ATTR_S1_AP(ATTR_S1_AP_RO); } else { if ((prot & VM_PROT_WRITE) != 0) val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); if ((prot & VM_PROT_READ) != 0) val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); if ((prot & VM_PROT_EXECUTE) == 0) val |= ATTR_S2_XN(ATTR_S2_XN_ALL); } return (val); } /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); if (pmap->pm_stage == PM_STAGE1) { KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); } return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { vm_paddr_t pa_page; pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK; return (pa_page | (va & PAR_LOW_MASK)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & ~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | TATTR_PXN_TABLE | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; /* Verify that the ASID is set through TTBR0. */ KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0")); kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_l0_paddr = l0pt - kern_delta; kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); kernel_pmap->pm_stage = PM_STAGE1; kernel_pmap->pm_levels = 4; kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr; kernel_pmap->pm_asid_set = &asids; /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); start_pa = pa = KERNBASE - kern_delta; /* * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the * loader allocated the first and only l2 page table page used to map * the kernel, preloaded files and module metadata. */ freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } static void pmap_init_asids(struct asid_set *set, int bits) { int i; set->asid_bits = bits; /* * We may be too early in the overall initialization process to use * bit_alloc(). */ set->asid_set_size = 1 << set->asid_bits; set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size), M_WAITOK | M_ZERO); for (i = 0; i < ASID_FIRST_AVAILABLE; i++) bit_set(set->asid_set, i); set->asid_next = ASID_FIRST_AVAILABLE; mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct vm_phys_seg *seg, *next_seg; struct md_page *pvh; vm_size_t s; uint64_t mmfr1; int i, pv_npg, vmid_bits; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, ("pmap_init: can't assign to pagesizes[2]")); pagesizes[2] = L1_SIZE; } /* * Initialize the ASID allocator. */ pmap_init_asids(&asids, (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); if (has_hyp()) { mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); vmid_bits = 8; if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) == ID_AA64MMFR1_VMIDBits_16) vmid_bits = 16; pmap_init_asids(&vmids, vmid_bits); } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = 0; for (i = 0; i < vm_phys_nsegs; i++) { seg = &vm_phys_segs[i]; pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - pmap_l2_pindex(seg->start); } /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); /* * Set pointers from vm_phys_segs to pv_table. */ for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) { seg = &vm_phys_segs[i]; seg->md_first = pvh; pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - pmap_l2_pindex(seg->start); /* * If there is a following segment, and the final * superpage of this segment and the initial superpage * of the next segment are the same then adjust the * pv_table entry for that next segment down by one so * that the pv_table entries will be shared. */ if (i + 1 < vm_phys_nsegs) { next_seg = &vm_phys_segs[i + 1]; if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == pmap_l2_pindex(next_seg->start)) { pvh--; } } } vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { uint64_t r; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { r = atop(va); __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); } else { r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va); __asm __volatile("tlbi vae1is, %0" : : "r" (r)); } dsb(ish); isb(); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { uint64_t end, r, start; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { start = atop(sva); end = atop(eva); for (r = start; r < end; r++) __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); } else { start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); start |= atop(sva); end |= atop(eva); for (r = start; r < end; r++) __asm __volatile("tlbi vae1is, %0" : : "r" (r)); } dsb(ish); isb(); } static __inline void pmap_invalidate_all(pmap_t pmap) { uint64_t r; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { __asm __volatile("tlbi vmalle1is"); } else { r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); __asm __volatile("tlbi aside1is, %0" : : "r" (r)); } dsb(ish); isb(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_page_t m; int lvl; bool use; m = NULL; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); use = false; if ((prot & VM_PROT_WRITE) == 0) use = true; else if (pmap->pm_stage == PM_STAGE1 && (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) use = true; else if (pmap->pm_stage == PM_STAGE2 && ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) use = true; if (use) { switch (lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Walks the page tables to translate a kernel virtual address to a * physical address. Returns true if the kva is valid and stores the * physical address in pa if it is not NULL. */ bool pmap_klookup(vm_offset_t va, vm_paddr_t *pa) { pt_entry_t *pte, tpte; register_t intr; uint64_t par; /* * Disable interrupts so we don't get interrupted between asking * for address translation, and getting the result back. */ intr = intr_disable(); par = arm64_address_translate_s1e1r(va); intr_restore(intr); if (PAR_SUCCESS(par)) { if (pa != NULL) *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); return (true); } /* * Fall back to walking the page table. The address translation * instruction may fail when the page is in a break-before-make * sequence. As we only clear the valid bit in said sequence we * can walk the page table to find the physical address. */ pte = pmap_l1(kernel_pmap, va); if (pte == NULL) return (false); /* * A concurrent pmap_update_entry() will clear the entry's valid bit * but leave the rest of the entry unchanged. Therefore, we treat a * non-zero entry as being valid, and we ignore the valid bit when * determining whether the entry maps a block, page, or table. */ tpte = pmap_load(pte); if (tpte == 0) return (false); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { if (pa != NULL) *pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET); return (true); } pte = pmap_l1_to_l2(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (false); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { if (pa != NULL) *pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET); return (true); } pte = pmap_l2_to_l3(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (false); if (pa != NULL) *pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET); return (true); } vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); if (pmap_klookup(va, &pa) == false) return (0); return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | ATTR_S1_IDX(mode) | L3_PAGE; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(ADDR_IS_CANONICAL(sva), ("%s: Address not in canonical form: %lx", __func__, sva)); KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); if (ADDR_IS_KERNEL(va)) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } /* * Release a page table page reference after a failed attempt to create a * mapping. */ static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { struct spglist free; SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { /* * Although "va" was never mapped, the TLB could nonetheless * have intermediate entries that refer to the freed page * table pages. Invalidate those entries. * * XXX redundant invalidation (See _pmap_unwire_l3().) */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); pmap->pm_root.rt_root = 0; pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); pmap->pm_stage = PM_STAGE1; pmap->pm_levels = 4; pmap->pm_ttbr = pmap->pm_l0_paddr; pmap->pm_asid_set = &asids; PCPU_SET(curpmap, pmap); } int pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) { vm_page_t m; /* * allocate the l0 page */ - while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) - vm_wait(NULL); - + m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); - if ((m->flags & PG_ZERO) == 0) - pagezero(pmap->pm_l0); - pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); MPASS(levels == 3 || levels == 4); pmap->pm_levels = levels; pmap->pm_stage = stage; switch (stage) { case PM_STAGE1: pmap->pm_asid_set = &asids; break; case PM_STAGE2: pmap->pm_asid_set = &vmids; break; default: panic("%s: Invalid pmap type %d", __func__, stage); break; } /* XXX Temporarily disable deferred ASID allocation. */ pmap_alloc_asid(pmap); /* * Allocate the level 1 entry to use as the root. This will increase * the refcount on the level 1 page so it won't be removed until * pmap_release() is called. */ if (pmap->pm_levels == 3) { PMAP_LOCK(pmap); m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); PMAP_UNLOCK(pmap); } pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); + m->pindex = ptepindex; /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0p, l0e; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0p = &pmap->pm_l0[l0index]; KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; /* * Mark all kernel memory as not accessible from userspace * and userspace memory as not executable from the kernel. * This has been done for the bootstrap L0 entries in * locore.S. */ if (pmap == kernel_pmap) l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; else l0e |= TATTR_PXN_TABLE; pmap_store(l0p, l0e); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->ref_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->ref_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static pd_entry_t * pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, struct rwlock **lockp) { pd_entry_t *l1, *l2; vm_page_t l2pg; vm_pindex_t l2pindex; KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { l2 = pmap_l1_to_l2(l1, va); if (!ADDR_IS_KERNEL(va)) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->ref_count++; } else l2pg = NULL; } else if (!ADDR_IS_KERNEL(va)) { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL) { if (lockp != NULL) goto retry; else return (NULL); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; } else panic("pmap_alloc_l2: missing page table page for va %#lx", va); *l2pgp = l2pg; return (l2); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->ref_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { boolean_t rv; struct spglist free; struct asid_set *set; vm_page_t m; int asid; if (pmap->pm_levels != 4) { PMAP_ASSERT_STAGE2(pmap); KASSERT(pmap->pm_stats.resident_count == 1, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); SLIST_INIT(&free); m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); PMAP_LOCK(pmap); rv = pmap_unwire_l3(pmap, 0, m, &free); PMAP_UNLOCK(pmap); MPASS(rv == TRUE); vm_page_free_pages_toq(&free, true); } KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); /* * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate * the entries when removing them so rely on a later tlb invalidation. * this will happen when updating the VMID generation. Because of this * we don't reuse VMIDs within a generation. */ if (pmap->pm_stage == PM_STAGE1) { mtx_lock_spin(&set->asid_set_mutex); if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { asid = COOKIE_TO_ASID(pmap->pm_cookie); KASSERT(asid >= ASID_FIRST_AVAILABLE && asid < set->asid_set_size, ("pmap_release: pmap cookie has out-of-range asid")); bit_clear(set->asid_set, asid); } mtx_unlock_spin(&set->asid_set_mutex); } m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ - nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); + nkpg->pindex = kernel_vm_end >> L1_SHIFT; /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if (pmap_load(l2) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } - nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); + nkpg->pindex = kernel_vm_end >> L2_SHIFT; /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l2, paddr | L2_TABLE); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(m, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_page_t m, ml3, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); pvh = page_to_pvh(m); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { if (pmap_pte_dirty(pmap, old_l2)) vm_page_dirty(mt); if (old_l2 & ATTR_AF) vm_page_aflag_set(mt, PGA_REFERENCED); if (TAILQ_EMPTY(&mt->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->ref_count == NL3PG, ("pmap_remove_l2: l3 page ref count error")); ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(pmap, old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. */ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t l3pg, m; KASSERT(ADDR_IS_CANONICAL(sva), ("%s: Start address not in canonical form: %lx", __func__, sva)); KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, ("%s: End address not in canonical form: %lx", __func__, eva)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL; va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(pmap, old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS(pmap != kernel_pmap); MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); pmap_clear(l1); pmap_invalidate_page(pmap, sva); pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); rw_wlock(lock); retry: while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) { pmap_invalidate_page(pmap, pv->pv_va); vm_page_aflag_set(m, PGA_REFERENCED); } /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ if ((old_l2 & mask) == nbits) return; while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) cpu_spinwait(); /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. */ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && pmap_pte_dirty(pmap, old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; PMAP_ASSERT_STAGE1(pmap); KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_S1_XN; nbits |= ATTR_S1_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); if ((pmap_load(l1) & mask) != nbits) { pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); pmap_invalidate_page(pmap, sva); } continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) cpu_spinwait(); /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && pmap_pte_dirty(pmap, l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); /* * Clear the old mapping's valid bit, but leave the rest of the entry * unchanged, so that a lockless, concurrent pmap_kextract() can still * lookup the physical address. */ pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = page_to_pvh(m); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } setl2: if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { /* * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, * ATTR_SW_DBM can be cleared without a TLB invalidation. */ if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { /* * When the mapping is clean, i.e., ATTR_S1_AP_RO is * set, ATTR_SW_DBM can be cleared without a TLB * invalidation. */ if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ static int pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, int psind) { pd_entry_t *l0p, *l1p, *l2p, origpte; vm_page_t mp; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(psind > 0 && psind < MAXPAGESIZES, ("psind %d unexpected", psind)); KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, ("unaligned phys address %#lx newpte %#lx psind %d", (newpte & ~ATTR_MASK), newpte, psind)); restart: if (psind == 2) { l0p = pmap_l0(pmap, va); if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); if (mp == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } l1p = pmap_l0_to_l1(l0p, va); KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); origpte = pmap_load(l1p); } else { l1p = pmap_l0_to_l1(l0p, va); KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); origpte = pmap_load(l1p); if ((origpte & ATTR_DESCR_VALID) == 0) { mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & ~ATTR_MASK); mp->ref_count++; } } KASSERT((origpte & ATTR_DESCR_VALID) == 0 || ((origpte & ATTR_DESCR_MASK) == L1_BLOCK && (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l1p, newpte); } else /* (psind == 1) */ { l2p = pmap_l2(pmap, va); if (l2p == NULL) { mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); if (mp == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); l2p = &l2p[pmap_l2_index(va)]; origpte = pmap_load(l2p); } else { l1p = pmap_l1(pmap, va); origpte = pmap_load(l2p); if ((origpte & ATTR_DESCR_VALID) == 0) { mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & ~ATTR_MASK); mp->ref_count++; } } KASSERT((origpte & ATTR_DESCR_VALID) == 0 || ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l2p, newpte); } dsb(ishst); if ((origpte & ATTR_DESCR_VALID) == 0) pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; else if ((newpte & ATTR_SW_WIRED) == 0 && (origpte & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; return (KERN_SUCCESS); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); new_l3 |= pmap_pte_prot(pmap, prot); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (pmap->pm_stage == PM_STAGE1) { if (!ADDR_IS_KERNEL(va)) new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else new_l3 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l3 |= ATTR_S1_nG; } else { /* * Clear the access flag on executable mappings, this will be * set later when the page is accessed. The fault handler is * required to invalidate the I-cache. * * TODO: Switch to the valid flag to allow hardware management * of the access flag. Much of the pmap code assumes the * valid flag is set and fails to destroy the old page tables * correctly if it is clear. */ if (prot & VM_PROT_EXECUTE) new_l3 &= ~ATTR_AF; } if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) { if (pmap->pm_stage == PM_STAGE1) new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); else new_l3 &= ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); } } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); new_l3 &= ~L3_PAGE; if (psind == 2) new_l3 |= L1_BLOCK; else /* (psind == 1) */ new_l3 |= L2_BLOCK; rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); goto out; } if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (!ADDR_IS_KERNEL(va) && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->ref_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (!ADDR_IS_KERNEL(va)) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->ref_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (!ADDR_IS_KERNEL(va)) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Only allow adding new entries on stage 2 tables for now. * This simplifies cache invalidation as we may need to call * into EL2 to perform such actions. */ PMAP_ASSERT_STAGE1(pmap); /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(pmap, orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else { KASSERT((orig_l3 & ATTR_AF) != 0, ("pmap_enter: unmanaged mapping lacks ATTR_AF")); pmap_invalidate_page(pmap, va); } orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: if (pmap->pm_stage == PM_STAGE1) { /* * Sync icache if exec permission and attribute * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping * is stored and made valid for hardware table walk. If done * later, then other can access this page before caches are * properly synced. Don't do it for kernel memory which is * mapped with exec permission even if the memory isn't going * to hold executable code. The only time when icache sync is * needed is after kernel module is loaded and the relocation * info is processed. And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_S1_XN))) { PMAP_ASSERT_STAGE1(pmap); cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); } } else { cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); } /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { PMAP_ASSERT_STAGE1(pmap); KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(pmap, orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 /* * Try to promote from level 3 pages to a level 2 superpage. This * currently only works on stage 1 pmaps as pmap_promote_l2 looks at * stage 1 specific fields and performs a break-before-make sequence * that is incorrect a stage 2 pmap. */ if ((mpte == NULL || mpte->ref_count == NL3PG) && pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == VM_MEMATTR_DEVICE) new_l2 |= ATTR_S1_XN; if (!ADDR_IS_KERNEL(va)) new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else new_l2 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l2 |= ATTR_S1_nG; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) == KERN_SUCCESS); } /* * Returns true if every page table entry in the specified page table is * zero. */ static bool pmap_every_pte_zero(vm_paddr_t pa) { pt_entry_t *pt_end, *pte; KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); pte = (pt_entry_t *)PHYS_TO_DMAP(pa); for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { if (*pte != 0) return (false); } return (true); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If there are existing mappings, either abort or remove them. */ if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg == NULL || l2pg->ref_count > 1, ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (!ADDR_IS_KERNEL(va) || (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK || !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { if (l2pg != NULL) l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); if (!ADDR_IS_KERNEL(va)) { vm_page_free_pages_toq(&free, true); KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_l2: freed kernel page table page")); /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { if (l2pg != NULL) pmap_abort_ptp(pmap, va, l2pg); CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Conditionally sync the icache. See pmap_enter() for details. */ if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) != (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK), L2_SIZE); } /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (!ADDR_IS_KERNEL(va)) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->ref_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) mpte->ref_count--; return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); return (NULL); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == VM_MEMATTR_DEVICE) l3_val |= ATTR_S1_XN; if (!ADDR_IS_KERNEL(va)) l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else l3_val |= ATTR_S1_UXN; if (pmap != kernel_pmap) l3_val |= ATTR_S1_nG; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS(pmap != kernel_pmap); MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | ATTR_SW_WIRED)) == ATTR_SW_WIRED); pmap_clear_bits(l1, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_m, dstmpte, srcmpte; PMAP_ASSERT_STAGE1(dst_pmap); PMAP_ASSERT_STAGE1(src_pmap); if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= end_addr, ("partial update of non-transparent 1G page " "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", pmap_load(l1), addr, end_addr, va_next)); srcptepaddr = pmap_load(l1); l1 = pmap_l1(dst_pmap, addr); if (l1 == NULL) { if (_pmap_alloc_l3(dst_pmap, pmap_l0_pindex(addr), NULL) == NULL) break; l1 = pmap_l1(dst_pmap, addr); } else { l0 = pmap_l0(dst_pmap, addr); dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & ~ATTR_MASK); dst_m->ref_count++; } KASSERT(pmap_load(l1) == 0, ("1G mapping present in dst pmap " "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", pmap_load(l1), addr, end_addr, va_next)); pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { /* * We can only virtual copy whole superpages. */ if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); if (l2 == NULL) break; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { /* * We leave the dirty bit unchanged because * managed read/write superpage mappings are * required to be dirty. However, managed * superpage mappings are not required to * have their accessed bit set, so we clear * it because we don't know if this mapping * will be used. */ srcptepaddr &= ~ATTR_SW_WIRED; if ((srcptepaddr & ATTR_SW_MANAGED) != 0) srcptepaddr &= ~ATTR_AF; pmap_store(l2, srcptepaddr); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else pmap_abort_ptp(dst_pmap, addr, dst_m); continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->ref_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_S1_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->ref_count >= srcmpte->ref_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns true if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns false. */ bool pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; bool rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (false); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } /* Mark free */ pc->pc_map[field] |= bitmask; /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean/reference bits. */ if (pmap_pte_dirty(pmap, tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = page_to_pvh(m); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->ref_count == NL3PG, ("pmap_remove_pages: l3 page ref count error")); ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_S1_AP_RW_BIT; value |= ATTR_S1_AP(ATTR_S1_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_S1_AP_RW_BIT; value |= ATTR_S1_AP(ATTR_S1_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); rw_wlock(lock); retry: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); if ((oldpte & ATTR_SW_DBM) != 0) { while (!atomic_fcmpset_64(pte, &oldpte, (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) cpu_spinwait(); if ((oldpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(pmap, tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; PMAP_ASSERT_STAGE1(pmap); if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); l3 = pmap_l2_to_l3(l2, va); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, va, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(pmap, oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_S1_AP(ATTR_S1_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. */ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, memory_mapping_mode(pa)); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, PROT_NONE, mode); PMAP_UNLOCK(kernel_pmap); return (error); } /* * Changes the specified virtual address range's protections to those * specified by "prot". Like pmap_change_attr(), protections for aliases * in the direct map are updated as well. Protections on aliasing mappings may * be a subset of the requested protections; for example, mappings in the direct * map are never executable. */ int pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) { int error; /* Only supported within the kernel map. */ if (va < VM_MIN_KERNEL_ADDRESS) return (EINVAL); PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, prot, -1); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode) { vm_offset_t base, offset, tmpva; vm_size_t pte_size; pt_entry_t pte, *ptep, *newpte; pt_entry_t bits, mask; int lvl, rv; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base) && !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) return (EINVAL); bits = 0; mask = 0; if (mode != -1) { bits = ATTR_S1_IDX(mode); mask = ATTR_S1_IDX_MASK; if (mode == VM_MEMATTR_DEVICE) { mask |= ATTR_S1_XN; bits |= ATTR_S1_XN; } } if (prot != VM_PROT_NONE) { /* Don't mark the DMAP as executable. It never is on arm64. */ if (VIRT_IN_DMAP(base)) { prot &= ~VM_PROT_EXECUTE; /* * XXX Mark the DMAP as writable for now. We rely * on this in ddb & dtrace to insert breakpoint * instructions. */ prot |= VM_PROT_WRITE; } if ((prot & VM_PROT_WRITE) == 0) { bits |= ATTR_S1_AP(ATTR_S1_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { bits |= ATTR_S1_PXN; } bits |= ATTR_S1_UXN; mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; } for (tmpva = base; tmpva < base + size; ) { ptep = pmap_pte(kernel_pmap, tmpva, &lvl); if (ptep == NULL) return (EINVAL); if ((pmap_load(ptep) & mask) == bits) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: if ((tmpva & L1_OFFSET) == 0 && (base + size - tmpva) >= L1_SIZE) { pte_size = L1_SIZE; break; } newpte = pmap_demote_l1(kernel_pmap, ptep, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); ptep = pmap_l1_to_l2(ptep, tmpva); /* FALLTHROUGH */ case 2: if ((tmpva & L2_OFFSET) == 0 && (base + size - tmpva) >= L2_SIZE) { pte_size = L2_SIZE; break; } newpte = pmap_demote_l2(kernel_pmap, ptep, tmpva); if (newpte == NULL) return (EINVAL); ptep = pmap_l2_to_l3(ptep, tmpva); /* FALLTHROUGH */ case 3: pte_size = PAGE_SIZE; break; } /* Update the entry */ pte = pmap_load(ptep); pte &= ~mask; pte |= bits; pmap_update_entry(kernel_pmap, ptep, pte, tmpva, pte_size); if (!VIRT_IN_DMAP(tmpva)) { /* * Keep the DMAP memory in sync. */ rv = pmap_change_props_locked( PHYS_TO_DMAP(pte & ~ATTR_MASK), pte_size, prot, mode); if (rv != 0) return (rv); } /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, pte_size); tmpva += pte_size; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } - if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == + NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); l2 = NULL; goto fail; } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, VM_MEMATTR_WRITE_BACK); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); fail: if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * either the direct map or the early kernel memory. * Page table pages are preallocated for every other * part of the kernel address space, so the direct map * region and early kernel memory are the only parts of the * kernel address space that must be handled here. */ KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ - ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), - (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + ml3 = vm_page_alloc_noobj( + (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | + VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } + ml3->pindex = pmap_l2_pindex(va); if (!ADDR_IS_KERNEL(va)) { ml3->ref_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. * * When pmap_update_entry() clears the old L2 mapping, it (indirectly) * performs a dsb(). That dsb() ensures that the stores for filling * "l3" are visible before "l3" is added to the page table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, VM_MEMATTR_WRITE_BACK); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_ASSERT_STAGE1(pmap); PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_PSIND(3 - lvl); if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } /* * Garbage collect every ASID that is neither active on a processor nor * reserved. */ static void pmap_reset_asid_set(pmap_t pmap) { pmap_t curpmap; int asid, cpuid, epoch; struct asid_set *set; enum pmap_stage stage; set = pmap->pm_asid_set; stage = pmap->pm_stage; set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); mtx_assert(&set->asid_set_mutex, MA_OWNED); /* * Ensure that the store to asid_epoch is globally visible before the * loads from pc_curpmap are performed. */ epoch = set->asid_epoch + 1; if (epoch == INT_MAX) epoch = 0; set->asid_epoch = epoch; dsb(ishst); if (stage == PM_STAGE1) { __asm __volatile("tlbi vmalle1is"); } else { KASSERT(pmap_clean_stage2_tlbi != NULL, ("%s: Unset stage 2 tlb invalidation callback\n", __func__)); pmap_clean_stage2_tlbi(); } dsb(ish); bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_set_size - 1); CPU_FOREACH(cpuid) { if (cpuid == curcpu) continue; if (stage == PM_STAGE1) { curpmap = pcpu_find(cpuid)->pc_curpmap; PMAP_ASSERT_STAGE1(pmap); } else { curpmap = pcpu_find(cpuid)->pc_curvmpmap; if (curpmap == NULL) continue; PMAP_ASSERT_STAGE2(pmap); } KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); asid = COOKIE_TO_ASID(curpmap->pm_cookie); if (asid == -1) continue; bit_set(set->asid_set, asid); curpmap->pm_cookie = COOKIE_FROM(asid, epoch); } } /* * Allocate a new ASID for the specified pmap. */ static void pmap_alloc_asid(pmap_t pmap) { struct asid_set *set; int new_asid; set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); mtx_lock_spin(&set->asid_set_mutex); /* * While this processor was waiting to acquire the asid set mutex, * pmap_reset_asid_set() running on another processor might have * updated this pmap's cookie to the current epoch. In which case, we * don't need to allocate a new ASID. */ if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) goto out; bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, &new_asid); if (new_asid == -1) { bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_next, &new_asid); if (new_asid == -1) { pmap_reset_asid_set(pmap); bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_set_size, &new_asid); KASSERT(new_asid != -1, ("ASID allocation failure")); } } bit_set(set->asid_set, new_asid); set->asid_next = new_asid + 1; pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); out: mtx_unlock_spin(&set->asid_set_mutex); } /* * Compute the value that should be stored in ttbr0 to activate the specified * pmap. This value may change from time to time. */ uint64_t pmap_to_ttbr0(pmap_t pmap) { return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | pmap->pm_ttbr); } static bool pmap_activate_int(pmap_t pmap) { struct asid_set *set; int epoch; KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { /* * Handle the possibility that the old thread was preempted * after an "ic" or "tlbi" instruction but before it performed * a "dsb" instruction. If the old thread migrates to a new * processor, its completion of a "dsb" instruction on that * new processor does not guarantee that the "ic" or "tlbi" * instructions performed on the old processor have completed. */ dsb(ish); return (false); } set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); /* * Ensure that the store to curpmap is globally visible before the * load from asid_epoch is performed. */ if (pmap->pm_stage == PM_STAGE1) PCPU_SET(curpmap, pmap); else PCPU_SET(curvmpmap, pmap); dsb(ish); epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); if (epoch >= 0 && epoch != set->asid_epoch) pmap_alloc_asid(pmap); if (pmap->pm_stage == PM_STAGE1) { set_ttbr0(pmap_to_ttbr0(pmap)); if (PCPU_GET(bcast_tlbi_workaround) != 0) invalidate_local_icache(); } return (true); } void pmap_activate_vm(pmap_t pmap) { PMAP_ASSERT_STAGE2(pmap); (void)pmap_activate_int(pmap); } void pmap_activate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); PMAP_ASSERT_STAGE1(pmap); critical_enter(); (void)pmap_activate_int(pmap); critical_exit(); } /* * To eliminate the unused parameter "old", we would have to add an instruction * to cpu_switch(). */ struct pcb * pmap_switch(struct thread *old __unused, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { PMAP_ASSERT_STAGE1(pmap); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); if (ADDR_IS_KERNEL(va)) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } static int pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pd_entry_t *pdep; pt_entry_t *ptep, pte; int rv, lvl, dfsc; PMAP_ASSERT_STAGE2(pmap); rv = KERN_FAILURE; /* Data and insn aborts use same encoding for FSC field. */ dfsc = esr & ISS_DATA_DFSC_MASK; switch (dfsc) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: PMAP_LOCK(pmap); pdep = pmap_pde(pmap, far, &lvl); if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { PMAP_LOCK(pmap); break; } switch (lvl) { case 0: ptep = pmap_l0_to_l1(pdep, far); break; case 1: ptep = pmap_l1_to_l2(pdep, far); break; case 2: ptep = pmap_l2_to_l3(pdep, far); break; default: panic("%s: Invalid pde level %d", __func__,lvl); } goto fault_exec; case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); fault_exec: if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { if (icache_vmid) { pmap_invalidate_vpipt_icache(); } else { /* * If accessing an executable page invalidate * the I-cache so it will be valid when we * continue execution in the guest. The D-cache * is assumed to already be clean to the Point * of Coherency. */ if ((pte & ATTR_S2_XN_MASK) != ATTR_S2_XN(ATTR_S2_XN_NONE)) { invalidate_icache(); } } pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; } return (rv); } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t pte, *ptep; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } if (pmap->pm_stage == PM_STAGE2) return (pmap_stage2_fault(pmap, esr, far)); /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL) { pmap_set_bits(ptep, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL && ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { if ((pte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RO)) { pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); pmap_invalidate_page(pmap, far); } rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: /* * Retry the translation. A break-before-make sequence can * produce a transient fault. */ if (pmap == kernel_pmap) { /* * The translation fault may have occurred within a * critical section. Therefore, we must check the * address without acquiring the kernel pmap's lock. */ if (pmap_klookup(far, NULL)) rv = KERN_SUCCESS; } else { PMAP_LOCK(pmap); /* Ask the MMU to check the address. */ intr = intr_disable(); par = arm64_address_translate_s1e0r(far); intr_restore(intr); PMAP_UNLOCK(pmap); /* * If the translation was successful, then we can * return success to the trap handler. */ if (PAR_SUCCESS(par)) rv = KERN_SUCCESS; } break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int l3pages; int l3contig; int l2blocks; int l1blocks; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int index; if (eva <= range->sva) return; index = range->attrs & ATTR_S1_IDX_MASK; switch (index) { case ATTR_S1_IDX(VM_MEMATTR_DEVICE): mode = "DEV"; break; case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): mode = "UC"; break; case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): mode = "WB"; break; case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): mode = "WT"; break; default: printf( "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", __func__, index, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c %3s %d %d %d %d\n", range->sva, eva, (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', (range->attrs & ATTR_S1_AP_USER) != 0 ? 'u' : 's', mode, range->l1blocks, range->l2blocks, range->l3contig, range->l3pages); /* Reset to sentinel value. */ range->sva = 0xfffffffffffffffful; } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { return (range->attrs == attrs); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) { pt_entry_t attrs; attrs = l0e & (ATTR_S1_AP_MASK | ATTR_S1_XN); attrs |= l1e & (ATTR_S1_AP_MASK | ATTR_S1_XN); if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) attrs |= l1e & ATTR_S1_IDX_MASK; attrs |= l2e & (ATTR_S1_AP_MASK | ATTR_S1_XN); if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) attrs |= l2e & ATTR_S1_IDX_MASK; attrs |= l3e & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK); if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pd_entry_t l0e, *l1, l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = 0xfffffffffffffffful; /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Kernel page table pages are never freed, so at worst we will * observe inconsistencies in the output. */ for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; i++) { if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) sbuf_printf(sb, "\nDirect map:\n"); else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) sbuf_printf(sb, "\nKernel map:\n"); l0e = kernel_pmap->pm_l0[i]; if ((l0e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L0_SIZE; continue; } pa = l0e & ~ATTR_MASK; l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { l1e = l1[j]; if ((l1e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L1_SIZE; continue; } if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 0, 0); range.l1blocks++; sva += L1_SIZE; continue; } pa = l1e & ~ATTR_MASK; l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { l2e = l2[k]; if ((l2e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L2_SIZE; continue; } if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { sysctl_kmaps_check(sb, &range, sva, l0e, l1e, l2e, 0); range.l2blocks++; sva += L2_SIZE; continue; } pa = l2e & ~ATTR_MASK; l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_l3_index(sva); l < Ln_ENTRIES; l++, sva += L3_SIZE) { l3e = l3[l]; if ((l3e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, l0e, l1e, l2e, l3e); if ((l3e & ATTR_CONTIGUOUS) != 0) range.l3contig += l % 16 == 0 ? 1 : 0; else range.l3pages++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); diff --git a/sys/arm64/arm64/uma_machdep.c b/sys/arm64/arm64/uma_machdep.c index 3ef3dd3cc9e9..d1c656a3b9f1 100644 --- a/sys/arm64/arm64/uma_machdep.c +++ b/sys/arm64/arm64/uma_machdep.c @@ -1,74 +1,72 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_domain(NULL, 0, domain, - malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - pagezero(va); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } diff --git a/sys/arm64/iommu/iommu_pmap.c b/sys/arm64/iommu/iommu_pmap.c index 354c213569ff..7ea8a7ee93a3 100644 --- a/sys/arm64/iommu/iommu_pmap.c +++ b/sys/arm64/iommu/iommu_pmap.c @@ -1,895 +1,888 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2020-2021 Ruslan Bukin * Copyright (c) 2014-2021 Andrew Turner * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This work was supported by Innovate UK project 105694, "Digital Security * by Design (DSbD) Technology Platform Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps for ARM SMMUv3 and ARM Mali GPU. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IOMMU_PAGE_SIZE 4096 #define NL0PG (IOMMU_PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (IOMMU_PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (IOMMU_PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (IOMMU_PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E IOMMU_L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #define iommu_l0_pindex(v) (NUL2E + NUL1E + ((v) >> IOMMU_L0_SHIFT)) #define iommu_l1_pindex(v) (NUL2E + ((v) >> IOMMU_L1_SHIFT)) #define iommu_l2_pindex(v) ((v) >> IOMMU_L2_SHIFT) /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~IOMMU_L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~IOMMU_L0_OFFSET) == DMAP_MAX_ADDRESS); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_load(table) (*table) #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[iommu_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[iommu_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != IOMMU_L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) { pd_entry_t l1, *l2p; l1 = pmap_load(l1p); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(va >= VM_MAX_USER_ADDRESS || (l1 & ATTR_DESCR_VALID) != 0, ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK); return (&l2p[iommu_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != IOMMU_L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) { pd_entry_t l2; pt_entry_t *l3p; l2 = pmap_load(l2p); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(va >= VM_MAX_USER_ADDRESS || (l2 & ATTR_DESCR_VALID) != 0, ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK); return (&l3p[iommu_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != IOMMU_L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != IOMMU_L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != IOMMU_L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == IOMMU_L1_BLOCK) { *level = 1; return (l1); } if (desc != IOMMU_L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == IOMMU_L2_BLOCK) { *level = 2; return (l2); } if (desc != IOMMU_L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != IOMMU_L3_PAGE) return (NULL); return (l3); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == IOMMU_L3_PAGE); } CTASSERT(IOMMU_L1_BLOCK == IOMMU_L2_BLOCK); static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } static int iommu_pmap_pinit_levels(pmap_t pmap, int levels) { vm_page_t m; /* * allocate the l0 page */ - while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) - vm_wait(NULL); - + m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); - if ((m->flags & PG_ZERO) == 0) - pagezero(pmap->pm_l0); - pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); MPASS(levels == 3 || levels == 4); pmap->pm_levels = levels; /* * Allocate the level 1 entry to use as the root. This will increase * the refcount on the level 1 page so it won't be removed until * pmap_release() is called. */ if (pmap->pm_levels == 3) { PMAP_LOCK(pmap); m = _pmap_alloc_l3(pmap, NUL2E + NUL1E); PMAP_UNLOCK(pmap); } pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); return (1); } int iommu_pmap_pinit(pmap_t pmap) { return (iommu_pmap_pinit_levels(pmap, 4)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); + m->pindex = ptepindex; /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | IOMMU_L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> IOMMU_L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->ref_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | IOMMU_L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> IOMMU_L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->ref_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | IOMMU_L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void iommu_pmap_release(pmap_t pmap) { boolean_t rv; struct spglist free; vm_page_t m; if (pmap->pm_levels != 4) { KASSERT(pmap->pm_stats.resident_count == 1, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); SLIST_INIT(&free); m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); PMAP_LOCK(pmap); rv = pmap_unwire_l3(pmap, 0, m, &free); PMAP_UNLOCK(pmap); MPASS(rv == TRUE); vm_page_free_pages_toq(&free, true); } KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); vm_page_unwire_noq(m); vm_page_free_zero(m); } /*************************************************** * page management routines. ***************************************************/ /* * Add a single Mali GPU entry. This function does not sleep. */ int pmap_gpu_enter(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_prot_t prot, u_int flags) { pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; vm_page_t mpte; pd_entry_t *l1p; pd_entry_t *l2p; int lvl; int rv; KASSERT(pmap != kernel_pmap, ("kernel pmap used for GPU")); KASSERT(va < VM_MAXUSER_ADDRESS, ("wrong address space")); KASSERT((va & PAGE_MASK) == 0, ("va is misaligned")); KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); new_l3 = (pt_entry_t)(pa | ATTR_SH(ATTR_SH_IS) | IOMMU_L3_BLOCK); if ((prot & VM_PROT_WRITE) != 0) new_l3 |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); if ((prot & VM_PROT_READ) != 0) new_l3 |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); if ((prot & VM_PROT_EXECUTE) == 0) new_l3 |= ATTR_S2_XN(ATTR_S2_XN_ALL); CTR2(KTR_PMAP, "pmap_gpu_enter: %.16lx -> %.16lx", va, pa); PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); } else { mpte = _pmap_alloc_l3(pmap, iommu_l2_pindex(va)); if (mpte == NULL) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } /* * Ensure newly created l1, l2 are visible to GPU. * l0 is already visible by similar call in panfrost driver. * The cache entry for l3 handled below. */ l1p = pmap_l1(pmap, va); l2p = pmap_l2(pmap, va); cpu_dcache_wb_range((vm_offset_t)l1p, sizeof(pd_entry_t)); cpu_dcache_wb_range((vm_offset_t)l2p, sizeof(pd_entry_t)); goto retry; } orig_l3 = pmap_load(l3); KASSERT(!pmap_l3_valid(orig_l3), ("l3 is valid")); /* New mapping */ pmap_store(l3, new_l3); cpu_dcache_wb_range((vm_offset_t)l3, sizeof(pt_entry_t)); pmap_resident_count_inc(pmap, 1); dsb(ishst); rv = KERN_SUCCESS; out: PMAP_UNLOCK(pmap); return (rv); } /* * Remove a single Mali GPU entry. */ int pmap_gpu_remove(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *pte; int lvl; int rc; KASSERT((va & PAGE_MASK) == 0, ("va is misaligned")); KASSERT(pmap != kernel_pmap, ("kernel pmap used for GPU")); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va, &lvl); if (pde == NULL || lvl != 2) { rc = KERN_FAILURE; goto out; } pte = pmap_l2_to_l3(pde, va); pmap_resident_count_dec(pmap, 1); pmap_clear(pte); cpu_dcache_wb_range((vm_offset_t)pte, sizeof(pt_entry_t)); rc = KERN_SUCCESS; out: PMAP_UNLOCK(pmap); return (rc); } /* * Add a single SMMU entry. This function does not sleep. */ int pmap_smmu_enter(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_prot_t prot, u_int flags) { pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; vm_page_t mpte; int lvl; int rv; KASSERT(va < VM_MAXUSER_ADDRESS, ("wrong address space")); va = trunc_page(va); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_S1_IDX(VM_MEMATTR_DEVICE) | IOMMU_L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); new_l3 |= ATTR_S1_XN; /* Execute never. */ new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER); new_l3 |= ATTR_S1_nG; /* Non global. */ CTR2(KTR_PMAP, "pmap_senter: %.16lx -> %.16lx", va, pa); PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); } else { mpte = _pmap_alloc_l3(pmap, iommu_l2_pindex(va)); if (mpte == NULL) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } orig_l3 = pmap_load(l3); KASSERT(!pmap_l3_valid(orig_l3), ("l3 is valid")); /* New mapping */ pmap_store(l3, new_l3); pmap_resident_count_inc(pmap, 1); dsb(ishst); rv = KERN_SUCCESS; out: PMAP_UNLOCK(pmap); return (rv); } /* * Remove a single SMMU entry. */ int pmap_smmu_remove(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; int lvl; int rc; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid SMMU pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_resident_count_dec(pmap, 1); pmap_clear(pte); rc = KERN_SUCCESS; } else rc = KERN_FAILURE; PMAP_UNLOCK(pmap); return (rc); } /* * Remove all the allocated L1, L2 pages from SMMU pmap. * All the L3 entires must be cleared in advance, otherwise * this function panics. */ void iommu_pmap_remove_pages(pmap_t pmap) { pd_entry_t l0e, *l1, l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_page_t m, m0, m1; vm_offset_t sva; vm_paddr_t pa; vm_paddr_t pa0; vm_paddr_t pa1; int i, j, k, l; PMAP_LOCK(pmap); for (sva = VM_MINUSER_ADDRESS, i = iommu_l0_index(sva); (i < Ln_ENTRIES && sva < VM_MAXUSER_ADDRESS); i++) { l0e = pmap->pm_l0[i]; if ((l0e & ATTR_DESCR_VALID) == 0) { sva += IOMMU_L0_SIZE; continue; } pa0 = l0e & ~ATTR_MASK; m0 = PHYS_TO_VM_PAGE(pa0); l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0); for (j = iommu_l1_index(sva); j < Ln_ENTRIES; j++) { l1e = l1[j]; if ((l1e & ATTR_DESCR_VALID) == 0) { sva += IOMMU_L1_SIZE; continue; } if ((l1e & ATTR_DESCR_MASK) == IOMMU_L1_BLOCK) { sva += IOMMU_L1_SIZE; continue; } pa1 = l1e & ~ATTR_MASK; m1 = PHYS_TO_VM_PAGE(pa1); l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1); for (k = iommu_l2_index(sva); k < Ln_ENTRIES; k++) { l2e = l2[k]; if ((l2e & ATTR_DESCR_VALID) == 0) { sva += IOMMU_L2_SIZE; continue; } pa = l2e & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = iommu_l3_index(sva); l < Ln_ENTRIES; l++, sva += IOMMU_L3_SIZE) { l3e = l3[l]; if ((l3e & ATTR_DESCR_VALID) == 0) continue; panic("%s: l3e found for va %jx\n", __func__, sva); } vm_page_unwire_noq(m1); vm_page_unwire_noq(m); pmap_resident_count_dec(pmap, 1); vm_page_free(m); pmap_clear(&l2[k]); } vm_page_unwire_noq(m0); pmap_resident_count_dec(pmap, 1); vm_page_free(m1); pmap_clear(&l1[j]); } pmap_resident_count_dec(pmap, 1); vm_page_free(m0); pmap_clear(&pmap->pm_l0[i]); } KASSERT(pmap->pm_stats.resident_count == 0, ("Invalid resident count %jd", pmap->pm_stats.resident_count)); PMAP_UNLOCK(pmap); } diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c index 8da4736ab7d5..f712c0c155a4 100644 --- a/sys/compat/linuxkpi/common/src/linux_page.c +++ b/sys/compat/linuxkpi/common/src/linux_page.c @@ -1,370 +1,370 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2016 Matthew Macy (mmacy@mattmacy.io) * Copyright (c) 2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void si_meminfo(struct sysinfo *si) { si->totalram = physmem; si->totalhigh = 0; si->mem_unit = PAGE_SIZE; } void * linux_page_address(struct page *page) { if (page->object != kernel_object) { return (PMAP_HAS_DMAP ? ((void *)(uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page))) : NULL); } return ((void *)(uintptr_t)(VM_MIN_KERNEL_ADDRESS + IDX_TO_OFF(page->pindex))); } vm_page_t linux_alloc_pages(gfp_t flags, unsigned int order) { vm_page_t page; if (PMAP_HAS_DMAP) { unsigned long npages = 1UL << order; int req = VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_NORMAL; if ((flags & M_ZERO) != 0) req |= VM_ALLOC_ZERO; if (order == 0 && (flags & GFP_DMA32) == 0) { - page = vm_page_alloc(NULL, 0, req); + page = vm_page_alloc_noobj(req); if (page == NULL) return (NULL); } else { vm_paddr_t pmax = (flags & GFP_DMA32) ? BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR; retry: page = vm_page_alloc_contig(NULL, 0, req, npages, 0, pmax, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (page == NULL) { if (flags & M_WAITOK) { if (!vm_page_reclaim_contig(req, npages, 0, pmax, PAGE_SIZE, 0)) { vm_wait(NULL); } flags &= ~M_WAITOK; goto retry; } return (NULL); } } if (flags & M_ZERO) { unsigned long x; for (x = 0; x != npages; x++) { vm_page_t pgo = page + x; if ((pgo->flags & PG_ZERO) == 0) pmap_zero_page(pgo); } } } else { vm_offset_t vaddr; vaddr = linux_alloc_kmem(flags, order); if (vaddr == 0) return (NULL); page = PHYS_TO_VM_PAGE(vtophys((void *)vaddr)); KASSERT(vaddr == (vm_offset_t)page_address(page), ("Page address mismatch")); } return (page); } void linux_free_pages(vm_page_t page, unsigned int order) { if (PMAP_HAS_DMAP) { unsigned long npages = 1UL << order; unsigned long x; for (x = 0; x != npages; x++) { vm_page_t pgo = page + x; if (vm_page_unwire_noq(pgo)) vm_page_free(pgo); } } else { vm_offset_t vaddr; vaddr = (vm_offset_t)page_address(page); linux_free_kmem(vaddr, order); } } vm_offset_t linux_alloc_kmem(gfp_t flags, unsigned int order) { size_t size = ((size_t)PAGE_SIZE) << order; vm_offset_t addr; if ((flags & GFP_DMA32) == 0) { addr = kmem_malloc(size, flags & GFP_NATIVE_MASK); } else { addr = kmem_alloc_contig(size, flags & GFP_NATIVE_MASK, 0, BUS_SPACE_MAXADDR_32BIT, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } return (addr); } void linux_free_kmem(vm_offset_t addr, unsigned int order) { size_t size = ((size_t)PAGE_SIZE) << order; kmem_free(addr, size); } static int linux_get_user_pages_internal(vm_map_t map, unsigned long start, int nr_pages, int write, struct page **pages) { vm_prot_t prot; size_t len; int count; prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ; len = ptoa((vm_offset_t)nr_pages); count = vm_fault_quick_hold_pages(map, start, len, prot, pages, nr_pages); return (count == -1 ? -EFAULT : nr_pages); } int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { vm_map_t map; vm_page_t *mp; vm_offset_t va; vm_offset_t end; vm_prot_t prot; int count; if (nr_pages == 0 || in_interrupt()) return (0); MPASS(pages != NULL); map = &curthread->td_proc->p_vmspace->vm_map; end = start + ptoa((vm_offset_t)nr_pages); if (!vm_map_range_valid(map, start, end)) return (-EINVAL); prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ; for (count = 0, mp = pages, va = start; va < end; mp++, va += PAGE_SIZE, count++) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) break; if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } return (count); } long get_user_pages_remote(struct task_struct *task, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int gup_flags, struct page **pages, struct vm_area_struct **vmas) { vm_map_t map; map = &task->task_thread->td_proc->p_vmspace->vm_map; return (linux_get_user_pages_internal(map, start, nr_pages, !!(gup_flags & FOLL_WRITE), pages)); } long get_user_pages(unsigned long start, unsigned long nr_pages, int gup_flags, struct page **pages, struct vm_area_struct **vmas) { vm_map_t map; map = &curthread->td_proc->p_vmspace->vm_map; return (linux_get_user_pages_internal(map, start, nr_pages, !!(gup_flags & FOLL_WRITE), pages)); } int is_vmalloc_addr(const void *addr) { return (vtoslab((vm_offset_t)addr & ~UMA_SLAB_MASK) != NULL); } vm_fault_t lkpi_vmf_insert_pfn_prot_locked(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t prot) { vm_object_t vm_obj = vma->vm_obj; vm_object_t tmp_obj; vm_page_t page; vm_pindex_t pindex; VM_OBJECT_ASSERT_WLOCKED(vm_obj); pindex = OFF_TO_IDX(addr - vma->vm_start); if (vma->vm_pfn_count == 0) vma->vm_pfn_first = pindex; MPASS(pindex <= OFF_TO_IDX(vma->vm_end)); retry: page = vm_page_grab(vm_obj, pindex, VM_ALLOC_NOCREAT); if (page == NULL) { page = PHYS_TO_VM_PAGE(IDX_TO_OFF(pfn)); if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) goto retry; if (page->object != NULL) { tmp_obj = page->object; vm_page_xunbusy(page); VM_OBJECT_WUNLOCK(vm_obj); VM_OBJECT_WLOCK(tmp_obj); if (page->object == tmp_obj && vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) { KASSERT(page->object == tmp_obj, ("page has changed identity")); KASSERT((page->oflags & VPO_UNMANAGED) == 0, ("page does not belong to shmem")); vm_pager_page_unswapped(page); if (pmap_page_is_mapped(page)) { vm_page_xunbusy(page); VM_OBJECT_WUNLOCK(tmp_obj); printf("%s: page rename failed: page " "is mapped\n", __func__); VM_OBJECT_WLOCK(vm_obj); return (VM_FAULT_NOPAGE); } vm_page_remove(page); } VM_OBJECT_WUNLOCK(tmp_obj); VM_OBJECT_WLOCK(vm_obj); goto retry; } if (vm_page_insert(page, vm_obj, pindex)) { vm_page_xunbusy(page); return (VM_FAULT_OOM); } vm_page_valid(page); } pmap_page_set_memattr(page, pgprot2cachemode(prot)); vma->vm_pfn_count++; return (VM_FAULT_NOPAGE); } /* * Although FreeBSD version of unmap_mapping_range has semantics and types of * parameters compatible with Linux version, the values passed in are different * @obj should match to vm_private_data field of vm_area_struct returned by * mmap file operation handler, see linux_file_mmap_single() sources * @holelen should match to size of area to be munmapped. */ void lkpi_unmap_mapping_range(void *obj, loff_t const holebegin __unused, loff_t const holelen, int even_cows __unused) { vm_object_t devobj; vm_page_t page; int i, page_count; devobj = cdev_pager_lookup(obj); if (devobj != NULL) { page_count = OFF_TO_IDX(holelen); VM_OBJECT_WLOCK(devobj); retry: for (i = 0; i < page_count; i++) { page = vm_page_lookup(devobj, i); if (page == NULL) continue; if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) goto retry; cdev_pager_free_page(devobj, page); } VM_OBJECT_WUNLOCK(devobj); vm_object_deallocate(devobj); } } diff --git a/sys/dev/drm2/ttm/ttm_page_alloc.c b/sys/dev/drm2/ttm/ttm_page_alloc.c index fbb830405de0..b35a06520e07 100644 --- a/sys/dev/drm2/ttm/ttm_page_alloc.c +++ b/sys/dev/drm2/ttm/ttm_page_alloc.c @@ -1,926 +1,921 @@ /* * Copyright (c) Red Hat Inc. * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Authors: Dave Airlie * Jerome Glisse * Pauli Nieminen */ /* * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. */ /* simple list based uncached page pool * - Pool collects resently freed pages for reuse * - Use page->lru to keep a free list * - doesn't track currently in use pages */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define NUM_PAGES_TO_ALLOC (PAGE_SIZE/sizeof(vm_page_t)) #define SMALL_ALLOCATION 16 #define FREE_ALL_PAGES (~0U) /* times are in msecs */ #define PAGE_FREE_INTERVAL 1000 /** * struct ttm_page_pool - Pool to reuse recently allocated uc/wc pages. * * @lock: Protects the shared pool from concurrnet access. Must be used with * irqsave/irqrestore variants because pool allocator maybe called from * delayed work. * @fill_lock: Prevent concurrent calls to fill. * @list: Pool of free uc/wc pages for fast reuse. * @gfp_flags: Flags to pass for alloc_page. * @npages: Number of pages in pool. */ struct ttm_page_pool { struct mtx lock; bool fill_lock; bool dma32; struct pglist list; int ttm_page_alloc_flags; unsigned npages; char *name; unsigned long nfrees; unsigned long nrefills; }; /** * Limits for the pool. They are handled without locks because only place where * they may change is in sysfs store. They won't have immediate effect anyway * so forcing serialization to access them is pointless. */ struct ttm_pool_opts { unsigned alloc_size; unsigned max_size; unsigned small; }; #define NUM_POOLS 4 /** * struct ttm_pool_manager - Holds memory pools for fst allocation * * Manager is read only object for pool code so it doesn't need locking. * * @free_interval: minimum number of jiffies between freeing pages from pool. * @page_alloc_inited: reference counting for pool allocation. * @work: Work that is used to shrink the pool. Work is only run when there is * some pages to free. * @small_allocation: Limit in number of pages what is small allocation. * * @pools: All pool objects in use. **/ struct ttm_pool_manager { unsigned int kobj_ref; eventhandler_tag lowmem_handler; struct ttm_pool_opts options; union { struct ttm_page_pool u_pools[NUM_POOLS]; struct _utag { struct ttm_page_pool u_wc_pool; struct ttm_page_pool u_uc_pool; struct ttm_page_pool u_wc_pool_dma32; struct ttm_page_pool u_uc_pool_dma32; } _ut; } _u; }; #define pools _u.u_pools #define wc_pool _u._ut.u_wc_pool #define uc_pool _u._ut.u_uc_pool #define wc_pool_dma32 _u._ut.u_wc_pool_dma32 #define uc_pool_dma32 _u._ut.u_uc_pool_dma32 MALLOC_DEFINE(M_TTM_POOLMGR, "ttm_poolmgr", "TTM Pool Manager"); static void ttm_vm_page_free(vm_page_t m) { KASSERT(m->object == NULL, ("ttm page %p is owned", m)); KASSERT(vm_page_wired(m), ("ttm lost wire %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("ttm lost fictitious %p", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("ttm got unmanaged %p", m)); m->flags &= ~PG_FICTITIOUS; m->oflags |= VPO_UNMANAGED; vm_page_unwire_noq(m); vm_page_free(m); } static vm_memattr_t ttm_caching_state_to_vm(enum ttm_caching_state cstate) { switch (cstate) { case tt_uncached: return (VM_MEMATTR_UNCACHEABLE); case tt_wc: return (VM_MEMATTR_WRITE_COMBINING); case tt_cached: return (VM_MEMATTR_WRITE_BACK); } panic("caching state %d\n", cstate); } static vm_page_t ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr) { vm_page_t p; int tries; for (tries = 0; ; tries++) { p = vm_page_alloc_contig(NULL, 0, req, 1, 0, 0xffffffff, PAGE_SIZE, 0, memattr); if (p != NULL || tries > 2) return (p); if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff, PAGE_SIZE, 0)) vm_wait(NULL); } } static vm_page_t ttm_vm_page_alloc_any(int req, vm_memattr_t memattr) { vm_page_t p; - while (1) { - p = vm_page_alloc(NULL, 0, req); - if (p != NULL) - break; - vm_wait(NULL); - } + p = vm_page_alloc_noobj(req | VM_ALLOC_WAITOK); pmap_page_set_memattr(p, memattr); return (p); } static vm_page_t ttm_vm_page_alloc(int flags, enum ttm_caching_state cstate) { vm_page_t p; vm_memattr_t memattr; int req; memattr = ttm_caching_state_to_vm(cstate); req = VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ; if ((flags & TTM_PAGE_FLAG_ZERO_ALLOC) != 0) req |= VM_ALLOC_ZERO; if ((flags & TTM_PAGE_FLAG_DMA32) != 0) p = ttm_vm_page_alloc_dma32(req, memattr); else p = ttm_vm_page_alloc_any(req, memattr); if (p != NULL) { p->oflags &= ~VPO_UNMANAGED; p->flags |= PG_FICTITIOUS; } return (p); } static void ttm_pool_kobj_release(struct ttm_pool_manager *m) { free(m, M_TTM_POOLMGR); } #if 0 /* XXXKIB sysctl */ static ssize_t ttm_pool_store(struct ttm_pool_manager *m, struct attribute *attr, const char *buffer, size_t size) { int chars; unsigned val; chars = sscanf(buffer, "%u", &val); if (chars == 0) return size; /* Convert kb to number of pages */ val = val / (PAGE_SIZE >> 10); if (attr == &ttm_page_pool_max) m->options.max_size = val; else if (attr == &ttm_page_pool_small) m->options.small = val; else if (attr == &ttm_page_pool_alloc_size) { if (val > NUM_PAGES_TO_ALLOC*8) { pr_err("Setting allocation size to %lu is not allowed. Recommended size is %lu\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 7), NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); return size; } else if (val > NUM_PAGES_TO_ALLOC) { pr_warn("Setting allocation size to larger than %lu is not recommended\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); } m->options.alloc_size = val; } return size; } static ssize_t ttm_pool_show(struct ttm_pool_manager *m, struct attribute *attr, char *buffer) { unsigned val = 0; if (attr == &ttm_page_pool_max) val = m->options.max_size; else if (attr == &ttm_page_pool_small) val = m->options.small; else if (attr == &ttm_page_pool_alloc_size) val = m->options.alloc_size; val = val * (PAGE_SIZE >> 10); return snprintf(buffer, PAGE_SIZE, "%u\n", val); } #endif static struct ttm_pool_manager *_manager; static int set_pages_array_wb(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_BACK); #endif return 0; } static int set_pages_array_wc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_COMBINING); #endif return 0; } static int set_pages_array_uc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_UNCACHEABLE); #endif return 0; } /** * Select the right pool or requested caching state and ttm flags. */ static struct ttm_page_pool *ttm_get_pool(int flags, enum ttm_caching_state cstate) { int pool_index; if (cstate == tt_cached) return NULL; if (cstate == tt_wc) pool_index = 0x0; else pool_index = 0x1; if (flags & TTM_PAGE_FLAG_DMA32) pool_index |= 0x2; return &_manager->pools[pool_index]; } /* set memory back to wb and free the pages. */ static void ttm_pages_put(vm_page_t *pages, unsigned npages) { unsigned i; /* Our VM handles vm memattr automatically on the page free. */ if (set_pages_array_wb(pages, npages)) printf("[TTM] Failed to set %d pages to wb!\n", npages); for (i = 0; i < npages; ++i) ttm_vm_page_free(pages[i]); } static void ttm_pool_update_free_locked(struct ttm_page_pool *pool, unsigned freed_pages) { pool->npages -= freed_pages; pool->nfrees += freed_pages; } /** * Free pages from pool. * * To prevent hogging the ttm_swap process we only free NUM_PAGES_TO_ALLOC * number of pages in one go. * * @pool: to free the pages from * @free_all: If set to true will free all pages in pool **/ static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free) { vm_page_t p, p1; vm_page_t *pages_to_free; unsigned freed_pages = 0, npages_to_free = nr_free; unsigned i; if (NUM_PAGES_TO_ALLOC < nr_free) npages_to_free = NUM_PAGES_TO_ALLOC; pages_to_free = malloc(npages_to_free * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); restart: mtx_lock(&pool->lock); TAILQ_FOREACH_REVERSE_SAFE(p, &pool->list, pglist, plinks.q, p1) { if (freed_pages >= npages_to_free) break; pages_to_free[freed_pages++] = p; /* We can only remove NUM_PAGES_TO_ALLOC at a time. */ if (freed_pages >= NUM_PAGES_TO_ALLOC) { /* remove range of pages from the pool */ for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); /** * Because changing page caching is costly * we unlock the pool to prevent stalling. */ mtx_unlock(&pool->lock); ttm_pages_put(pages_to_free, freed_pages); if (likely(nr_free != FREE_ALL_PAGES)) nr_free -= freed_pages; if (NUM_PAGES_TO_ALLOC >= nr_free) npages_to_free = nr_free; else npages_to_free = NUM_PAGES_TO_ALLOC; freed_pages = 0; /* free all so restart the processing */ if (nr_free) goto restart; /* Not allowed to fall through or break because * following context is inside spinlock while we are * outside here. */ goto out; } } /* remove range of pages from the pool */ if (freed_pages) { for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); nr_free -= freed_pages; } mtx_unlock(&pool->lock); if (freed_pages) ttm_pages_put(pages_to_free, freed_pages); out: free(pages_to_free, M_TEMP); return nr_free; } /* Get good estimation how many pages are free in pools */ static int ttm_pool_get_num_unused_pages(void) { unsigned i; int total = 0; for (i = 0; i < NUM_POOLS; ++i) total += _manager->pools[i].npages; return total; } /** * Callback for mm to request pool to reduce number of page held. */ static int ttm_pool_mm_shrink(void *arg) { static unsigned int start_pool = 0; unsigned i; unsigned pool_offset = atomic_fetchadd_int(&start_pool, 1); struct ttm_page_pool *pool; int shrink_pages = 100; /* XXXKIB */ pool_offset = pool_offset % NUM_POOLS; /* select start pool in round robin fashion */ for (i = 0; i < NUM_POOLS; ++i) { unsigned nr_free = shrink_pages; if (shrink_pages == 0) break; pool = &_manager->pools[(i + pool_offset)%NUM_POOLS]; shrink_pages = ttm_page_pool_free(pool, nr_free); } /* return estimated number of unused pages in pool */ return ttm_pool_get_num_unused_pages(); } static void ttm_pool_mm_shrink_init(struct ttm_pool_manager *manager) { manager->lowmem_handler = EVENTHANDLER_REGISTER(vm_lowmem, ttm_pool_mm_shrink, manager, EVENTHANDLER_PRI_ANY); } static void ttm_pool_mm_shrink_fini(struct ttm_pool_manager *manager) { EVENTHANDLER_DEREGISTER(vm_lowmem, manager->lowmem_handler); } static int ttm_set_pages_caching(vm_page_t *pages, enum ttm_caching_state cstate, unsigned cpages) { int r = 0; /* Set page caching */ switch (cstate) { case tt_uncached: r = set_pages_array_uc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to uc!\n", cpages); break; case tt_wc: r = set_pages_array_wc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to wc!\n", cpages); break; default: break; } return r; } /** * Free pages the pages that failed to change the caching state. If there is * any pages that have changed their caching state already put them to the * pool. */ static void ttm_handle_caching_state_failure(struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate, vm_page_t *failed_pages, unsigned cpages) { unsigned i; /* Failed pages have to be freed */ for (i = 0; i < cpages; ++i) { TAILQ_REMOVE(pages, failed_pages[i], plinks.q); ttm_vm_page_free(failed_pages[i]); } } /** * Allocate new pages with correct caching. * * This function is reentrant if caller updates count depending on number of * pages returned in pages array. */ static int ttm_alloc_new_pages(struct pglist *pages, int ttm_alloc_flags, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t *caching_array; vm_page_t p; int r = 0; unsigned i, cpages; unsigned max_cpages = min(count, (unsigned)(PAGE_SIZE/sizeof(vm_page_t))); /* allocate array for page caching change */ caching_array = malloc(max_cpages * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); for (i = 0, cpages = 0; i < count; ++i) { p = ttm_vm_page_alloc(ttm_alloc_flags, cstate); if (!p) { printf("[TTM] Unable to get page %u\n", i); /* store already allocated pages in the pool after * setting the caching state */ if (cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } r = -ENOMEM; goto out; } #ifdef CONFIG_HIGHMEM /* KIB: nop */ /* gfp flags of highmem page should never be dma32 so we * we should be fine in such case */ if (!PageHighMem(p)) #endif { caching_array[cpages++] = p; if (cpages == max_cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) { ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); goto out; } cpages = 0; } } TAILQ_INSERT_HEAD(pages, p, plinks.q); } if (cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } out: free(caching_array, M_TEMP); return r; } /** * Fill the given pool if there aren't enough pages and the requested number of * pages is small. */ static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t p; int r; unsigned cpages = 0; /** * Only allow one pool fill operation at a time. * If pool doesn't have enough pages for the allocation new pages are * allocated from outside of pool. */ if (pool->fill_lock) return; pool->fill_lock = true; /* If allocation request is small and there are not enough * pages in a pool we fill the pool up first. */ if (count < _manager->options.small && count > pool->npages) { struct pglist new_pages; unsigned alloc_size = _manager->options.alloc_size; /** * Can't change page caching if in irqsave context. We have to * drop the pool->lock. */ mtx_unlock(&pool->lock); TAILQ_INIT(&new_pages); r = ttm_alloc_new_pages(&new_pages, pool->ttm_page_alloc_flags, ttm_flags, cstate, alloc_size); mtx_lock(&pool->lock); if (!r) { TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); ++pool->nrefills; pool->npages += alloc_size; } else { printf("[TTM] Failed to fill pool (%p)\n", pool); /* If we have any pages left put them to the pool. */ TAILQ_FOREACH(p, &pool->list, plinks.q) { ++cpages; } TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); pool->npages += cpages; } } pool->fill_lock = false; } /** * Cut 'count' number of pages from the pool and put them on the return list. * * @return count of pages still required to fulfill the request. */ static unsigned ttm_page_pool_get_pages(struct ttm_page_pool *pool, struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t p; unsigned i; mtx_lock(&pool->lock); ttm_page_pool_fill_locked(pool, ttm_flags, cstate, count); if (count >= pool->npages) { /* take all pages from the pool */ TAILQ_CONCAT(pages, &pool->list, plinks.q); count -= pool->npages; pool->npages = 0; goto out; } for (i = 0; i < count; i++) { p = TAILQ_FIRST(&pool->list); TAILQ_REMOVE(&pool->list, p, plinks.q); TAILQ_INSERT_TAIL(pages, p, plinks.q); } pool->npages -= count; count = 0; out: mtx_unlock(&pool->lock); return count; } /* Put all pages in pages list to correct pool to wait for reuse */ static void ttm_put_pages(vm_page_t *pages, unsigned npages, int flags, enum ttm_caching_state cstate) { struct ttm_page_pool *pool = ttm_get_pool(flags, cstate); unsigned i; if (pool == NULL) { /* No pool for this memory type so free the pages */ for (i = 0; i < npages; i++) { if (pages[i]) { ttm_vm_page_free(pages[i]); pages[i] = NULL; } } return; } mtx_lock(&pool->lock); for (i = 0; i < npages; i++) { if (pages[i]) { TAILQ_INSERT_TAIL(&pool->list, pages[i], plinks.q); pages[i] = NULL; pool->npages++; } } /* Check that we don't go over the pool limit */ npages = 0; if (pool->npages > _manager->options.max_size) { npages = pool->npages - _manager->options.max_size; /* free at least NUM_PAGES_TO_ALLOC number of pages * to reduce calls to set_memory_wb */ if (npages < NUM_PAGES_TO_ALLOC) npages = NUM_PAGES_TO_ALLOC; } mtx_unlock(&pool->lock); if (npages) ttm_page_pool_free(pool, npages); } /* * On success pages list will hold count number of correctly * cached pages. */ static int ttm_get_pages(vm_page_t *pages, unsigned npages, int flags, enum ttm_caching_state cstate) { struct ttm_page_pool *pool = ttm_get_pool(flags, cstate); struct pglist plist; vm_page_t p = NULL; int gfp_flags; unsigned count; int r; /* No pool for cached pages */ if (pool == NULL) { for (r = 0; r < npages; ++r) { p = ttm_vm_page_alloc(flags, cstate); if (!p) { printf("[TTM] Unable to allocate page\n"); return -ENOMEM; } pages[r] = p; } return 0; } /* combine zero flag to pool flags */ gfp_flags = flags | pool->ttm_page_alloc_flags; /* First we take pages from the pool */ TAILQ_INIT(&plist); npages = ttm_page_pool_get_pages(pool, &plist, flags, cstate, npages); count = 0; TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } /* clear the pages coming from the pool if requested */ if (flags & TTM_PAGE_FLAG_ZERO_ALLOC) { TAILQ_FOREACH(p, &plist, plinks.q) { pmap_zero_page(p); } } /* If pool didn't have enough pages allocate new one. */ if (npages > 0) { /* ttm_alloc_new_pages doesn't reference pool so we can run * multiple requests in parallel. **/ TAILQ_INIT(&plist); r = ttm_alloc_new_pages(&plist, gfp_flags, flags, cstate, npages); TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } if (r) { /* If there is any pages in the list put them back to * the pool. */ printf("[TTM] Failed to allocate extra pages for large request\n"); ttm_put_pages(pages, count, flags, cstate); return r; } } return 0; } static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, int flags, char *name) { mtx_init(&pool->lock, "ttmpool", NULL, MTX_DEF); pool->fill_lock = false; TAILQ_INIT(&pool->list); pool->npages = pool->nfrees = 0; pool->ttm_page_alloc_flags = flags; pool->name = name; } int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages) { if (_manager != NULL) printf("[TTM] manager != NULL\n"); printf("[TTM] Initializing pool allocator\n"); _manager = malloc(sizeof(*_manager), M_TTM_POOLMGR, M_WAITOK | M_ZERO); ttm_page_pool_init_locked(&_manager->wc_pool, 0, "wc"); ttm_page_pool_init_locked(&_manager->uc_pool, 0, "uc"); ttm_page_pool_init_locked(&_manager->wc_pool_dma32, TTM_PAGE_FLAG_DMA32, "wc dma"); ttm_page_pool_init_locked(&_manager->uc_pool_dma32, TTM_PAGE_FLAG_DMA32, "uc dma"); _manager->options.max_size = max_pages; _manager->options.small = SMALL_ALLOCATION; _manager->options.alloc_size = NUM_PAGES_TO_ALLOC; refcount_init(&_manager->kobj_ref, 1); ttm_pool_mm_shrink_init(_manager); return 0; } void ttm_page_alloc_fini(void) { int i; printf("[TTM] Finalizing pool allocator\n"); ttm_pool_mm_shrink_fini(_manager); for (i = 0; i < NUM_POOLS; ++i) ttm_page_pool_free(&_manager->pools[i], FREE_ALL_PAGES); if (refcount_release(&_manager->kobj_ref)) ttm_pool_kobj_release(_manager); _manager = NULL; } int ttm_pool_populate(struct ttm_tt *ttm) { struct ttm_mem_global *mem_glob = ttm->glob->mem_glob; unsigned i; int ret; if (ttm->state != tt_unpopulated) return 0; for (i = 0; i < ttm->num_pages; ++i) { ret = ttm_get_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); if (ret != 0) { ttm_pool_unpopulate(ttm); return -ENOMEM; } ret = ttm_mem_global_alloc_page(mem_glob, ttm->pages[i], false, false); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return -ENOMEM; } } if (unlikely(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)) { ret = ttm_tt_swapin(ttm); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return ret; } } ttm->state = tt_unbound; return 0; } void ttm_pool_unpopulate(struct ttm_tt *ttm) { unsigned i; for (i = 0; i < ttm->num_pages; ++i) { if (ttm->pages[i]) { ttm_mem_global_free_page(ttm->glob->mem_glob, ttm->pages[i]); ttm_put_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); } } ttm->state = tt_unpopulated; } #if 0 /* XXXKIB sysctl */ int ttm_page_alloc_debugfs(struct seq_file *m, void *data) { struct ttm_page_pool *p; unsigned i; char *h[] = {"pool", "refills", "pages freed", "size"}; if (!_manager) { seq_printf(m, "No pool allocator running.\n"); return 0; } seq_printf(m, "%6s %12s %13s %8s\n", h[0], h[1], h[2], h[3]); for (i = 0; i < NUM_POOLS; ++i) { p = &_manager->pools[i]; seq_printf(m, "%6s %12ld %13ld %8d\n", p->name, p->nrefills, p->nfrees, p->npages); } return 0; } #endif diff --git a/sys/dev/ti/if_ti.c b/sys/dev/ti/if_ti.c index 9871212d6379..e8b4eb537883 100644 --- a/sys/dev/ti/if_ti.c +++ b/sys/dev/ti/if_ti.c @@ -1,4038 +1,4037 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1997, 1998, 1999 * Bill Paul . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Bill Paul. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Bill Paul AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Alteon Networks Tigon PCI gigabit ethernet driver for FreeBSD. * Manuals, sample driver and firmware source kits are available * from http://www.alteon.com/support/openkits. * * Written by Bill Paul * Electrical Engineering Department * Columbia University, New York City */ /* * The Alteon Networks Tigon chip contains an embedded R4000 CPU, * gigabit MAC, dual DMA channels and a PCI interface unit. NICs * using the Tigon may have anywhere from 512K to 2MB of SRAM. The * Tigon supports hardware IP, TCP and UCP checksumming, multicast * filtering and jumbo (9014 byte) frames. The hardware is largely * controlled by firmware, which must be loaded into the NIC during * initialization. * * The Tigon 2 contains 2 R4000 CPUs and requires a newer firmware * revision, which supports new features such as extended commands, * extended jumbo receive ring descriptors and a mini receive ring. * * Alteon Networks is to be commended for releasing such a vast amount * of development material for the Tigon NIC without requiring an NDA * (although they really should have done it a long time ago). With * any luck, the other vendors will finally wise up and follow Alteon's * stellar example. * * The firmware for the Tigon 1 and 2 NICs is compiled directly into * this driver by #including it as a C header file. This bloats the * driver somewhat, but it's the easiest method considering that the * driver code and firmware code need to be kept in sync. The source * for the firmware is not provided with the FreeBSD distribution since * compiling it requires a GNU toolchain targeted for mips-sgi-irix5.3. * * The following people deserve special thanks: * - Terry Murphy of 3Com, for providing a 3c985 Tigon 1 board * for testing * - Raymond Lee of Netgear, for providing a pair of Netgear * GA620 Tigon 2 boards for testing * - Ulf Zimmermann, for bringing the GA260 to my attention and * convincing me to write this driver. * - Andrew Gallatin for providing FreeBSD/Alpha support. */ #include __FBSDID("$FreeBSD$"); #include "opt_ti.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TI_SF_BUF_JUMBO #include #include #endif #include #include #include #include #include #include #include #define TI_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP) /* * We can only turn on header splitting if we're using extended receive * BDs. */ #if defined(TI_JUMBO_HDRSPLIT) && !defined(TI_SF_BUF_JUMBO) #error "options TI_JUMBO_HDRSPLIT requires TI_SF_BUF_JUMBO" #endif /* TI_JUMBO_HDRSPLIT && !TI_SF_BUF_JUMBO */ typedef enum { TI_SWAP_HTON, TI_SWAP_NTOH } ti_swap_type; /* * Various supported device vendors/types and their names. */ static const struct ti_type ti_devs[] = { { ALT_VENDORID, ALT_DEVICEID_ACENIC, "Alteon AceNIC 1000baseSX Gigabit Ethernet" }, { ALT_VENDORID, ALT_DEVICEID_ACENIC_COPPER, "Alteon AceNIC 1000baseT Gigabit Ethernet" }, { TC_VENDORID, TC_DEVICEID_3C985, "3Com 3c985-SX Gigabit Ethernet" }, { NG_VENDORID, NG_DEVICEID_GA620, "Netgear GA620 1000baseSX Gigabit Ethernet" }, { NG_VENDORID, NG_DEVICEID_GA620T, "Netgear GA620 1000baseT Gigabit Ethernet" }, { SGI_VENDORID, SGI_DEVICEID_TIGON, "Silicon Graphics Gigabit Ethernet" }, { DEC_VENDORID, DEC_DEVICEID_FARALLON_PN9000SX, "Farallon PN9000SX Gigabit Ethernet" }, { 0, 0, NULL } }; static d_open_t ti_open; static d_close_t ti_close; static d_ioctl_t ti_ioctl2; static struct cdevsw ti_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = ti_open, .d_close = ti_close, .d_ioctl = ti_ioctl2, .d_name = "ti", }; static int ti_probe(device_t); static int ti_attach(device_t); static int ti_detach(device_t); static void ti_txeof(struct ti_softc *); static void ti_rxeof(struct ti_softc *); static int ti_encap(struct ti_softc *, struct mbuf **); static void ti_intr(void *); static void ti_start(struct ifnet *); static void ti_start_locked(struct ifnet *); static int ti_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t ti_get_counter(struct ifnet *, ift_counter); static void ti_init(void *); static void ti_init_locked(void *); static void ti_init2(struct ti_softc *); static void ti_stop(struct ti_softc *); static void ti_watchdog(void *); static int ti_shutdown(device_t); static int ti_ifmedia_upd(struct ifnet *); static int ti_ifmedia_upd_locked(struct ti_softc *); static void ti_ifmedia_sts(struct ifnet *, struct ifmediareq *); static uint32_t ti_eeprom_putbyte(struct ti_softc *, int); static uint8_t ti_eeprom_getbyte(struct ti_softc *, int, uint8_t *); static int ti_read_eeprom(struct ti_softc *, caddr_t, int, int); static u_int ti_add_mcast(void *, struct sockaddr_dl *, u_int); static u_int ti_del_mcast(void *, struct sockaddr_dl *, u_int); static void ti_setmulti(struct ti_softc *); static void ti_mem_read(struct ti_softc *, uint32_t, uint32_t, void *); static void ti_mem_write(struct ti_softc *, uint32_t, uint32_t, void *); static void ti_mem_zero(struct ti_softc *, uint32_t, uint32_t); static int ti_copy_mem(struct ti_softc *, uint32_t, uint32_t, caddr_t, int, int); static int ti_copy_scratch(struct ti_softc *, uint32_t, uint32_t, caddr_t, int, int, int); static int ti_bcopy_swap(const void *, void *, size_t, ti_swap_type); static void ti_loadfw(struct ti_softc *); static void ti_cmd(struct ti_softc *, struct ti_cmd_desc *); static void ti_cmd_ext(struct ti_softc *, struct ti_cmd_desc *, caddr_t, int); static void ti_handle_events(struct ti_softc *); static void ti_dma_map_addr(void *, bus_dma_segment_t *, int, int); static int ti_dma_alloc(struct ti_softc *); static void ti_dma_free(struct ti_softc *); static int ti_dma_ring_alloc(struct ti_softc *, bus_size_t, bus_size_t, bus_dma_tag_t *, uint8_t **, bus_dmamap_t *, bus_addr_t *, const char *); static void ti_dma_ring_free(struct ti_softc *, bus_dma_tag_t *, uint8_t **, bus_dmamap_t, bus_addr_t *); static int ti_newbuf_std(struct ti_softc *, int); static int ti_newbuf_mini(struct ti_softc *, int); static int ti_newbuf_jumbo(struct ti_softc *, int, struct mbuf *); static int ti_init_rx_ring_std(struct ti_softc *); static void ti_free_rx_ring_std(struct ti_softc *); static int ti_init_rx_ring_jumbo(struct ti_softc *); static void ti_free_rx_ring_jumbo(struct ti_softc *); static int ti_init_rx_ring_mini(struct ti_softc *); static void ti_free_rx_ring_mini(struct ti_softc *); static void ti_free_tx_ring(struct ti_softc *); static int ti_init_tx_ring(struct ti_softc *); static void ti_discard_std(struct ti_softc *, int); #ifndef TI_SF_BUF_JUMBO static void ti_discard_jumbo(struct ti_softc *, int); #endif static void ti_discard_mini(struct ti_softc *, int); static int ti_64bitslot_war(struct ti_softc *); static int ti_chipinit(struct ti_softc *); static int ti_gibinit(struct ti_softc *); #ifdef TI_JUMBO_HDRSPLIT static __inline void ti_hdr_split(struct mbuf *top, int hdr_len, int pkt_len, int idx); #endif /* TI_JUMBO_HDRSPLIT */ static void ti_sysctl_node(struct ti_softc *); static device_method_t ti_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ti_probe), DEVMETHOD(device_attach, ti_attach), DEVMETHOD(device_detach, ti_detach), DEVMETHOD(device_shutdown, ti_shutdown), { 0, 0 } }; static driver_t ti_driver = { "ti", ti_methods, sizeof(struct ti_softc) }; static devclass_t ti_devclass; DRIVER_MODULE(ti, pci, ti_driver, ti_devclass, 0, 0); MODULE_DEPEND(ti, pci, 1, 1, 1); MODULE_DEPEND(ti, ether, 1, 1, 1); /* * Send an instruction or address to the EEPROM, check for ACK. */ static uint32_t ti_eeprom_putbyte(struct ti_softc *sc, int byte) { int i, ack = 0; /* * Make sure we're in TX mode. */ TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); /* * Feed in each bit and stobe the clock. */ for (i = 0x80; i; i >>= 1) { if (byte & i) { TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_DOUT); } else { TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_DOUT); } DELAY(1); TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); } /* * Turn off TX mode. */ TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); /* * Check for ack. */ TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); ack = CSR_READ_4(sc, TI_MISC_LOCAL_CTL) & TI_MLC_EE_DIN; TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); return (ack); } /* * Read a byte of data stored in the EEPROM at address 'addr.' * We have to send two address bytes since the EEPROM can hold * more than 256 bytes of data. */ static uint8_t ti_eeprom_getbyte(struct ti_softc *sc, int addr, uint8_t *dest) { int i; uint8_t byte = 0; EEPROM_START; /* * Send write control code to EEPROM. */ if (ti_eeprom_putbyte(sc, EEPROM_CTL_WRITE)) { device_printf(sc->ti_dev, "failed to send write command, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Send first byte of address of byte we want to read. */ if (ti_eeprom_putbyte(sc, (addr >> 8) & 0xFF)) { device_printf(sc->ti_dev, "failed to send address, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Send second byte address of byte we want to read. */ if (ti_eeprom_putbyte(sc, addr & 0xFF)) { device_printf(sc->ti_dev, "failed to send address, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } EEPROM_STOP; EEPROM_START; /* * Send read control code to EEPROM. */ if (ti_eeprom_putbyte(sc, EEPROM_CTL_READ)) { device_printf(sc->ti_dev, "failed to send read command, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Start reading bits from EEPROM. */ TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); for (i = 0x80; i; i >>= 1) { TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); if (CSR_READ_4(sc, TI_MISC_LOCAL_CTL) & TI_MLC_EE_DIN) byte |= i; TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); } EEPROM_STOP; /* * No ACK generated for read, so just return byte. */ *dest = byte; return (0); } /* * Read a sequence of bytes from the EEPROM. */ static int ti_read_eeprom(struct ti_softc *sc, caddr_t dest, int off, int cnt) { int err = 0, i; uint8_t byte = 0; for (i = 0; i < cnt; i++) { err = ti_eeprom_getbyte(sc, off + i, &byte); if (err) break; *(dest + i) = byte; } return (err ? 1 : 0); } /* * NIC memory read function. * Can be used to copy data from NIC local memory. */ static void ti_mem_read(struct ti_softc *sc, uint32_t addr, uint32_t len, void *buf) { int segptr, segsize, cnt; char *ptr; segptr = addr; cnt = len; ptr = buf; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, rounddown2(segptr, TI_WINLEN)); bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), (uint32_t *)ptr, segsize / 4); ptr += segsize; segptr += segsize; cnt -= segsize; } } /* * NIC memory write function. * Can be used to copy data into NIC local memory. */ static void ti_mem_write(struct ti_softc *sc, uint32_t addr, uint32_t len, void *buf) { int segptr, segsize, cnt; char *ptr; segptr = addr; cnt = len; ptr = buf; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, rounddown2(segptr, TI_WINLEN)); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), (uint32_t *)ptr, segsize / 4); ptr += segsize; segptr += segsize; cnt -= segsize; } } /* * NIC memory read function. * Can be used to clear a section of NIC local memory. */ static void ti_mem_zero(struct ti_softc *sc, uint32_t addr, uint32_t len) { int segptr, segsize, cnt; segptr = addr; cnt = len; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, rounddown2(segptr, TI_WINLEN)); bus_space_set_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), 0, segsize / 4); segptr += segsize; cnt -= segsize; } } static int ti_copy_mem(struct ti_softc *sc, uint32_t tigon_addr, uint32_t len, caddr_t buf, int useraddr, int readdata) { int segptr, segsize, cnt; caddr_t ptr; uint32_t origwin; int resid, segresid; int first_pass; TI_LOCK_ASSERT(sc); /* * At the moment, we don't handle non-aligned cases, we just bail. * If this proves to be a problem, it will be fixed. */ if (readdata == 0 && (tigon_addr & 0x3) != 0) { device_printf(sc->ti_dev, "%s: tigon address %#x isn't " "word-aligned\n", __func__, tigon_addr); device_printf(sc->ti_dev, "%s: unaligned writes aren't " "yet supported\n", __func__); return (EINVAL); } segptr = tigon_addr & ~0x3; segresid = tigon_addr - segptr; /* * This is the non-aligned amount left over that we'll need to * copy. */ resid = len & 0x3; /* Add in the left over amount at the front of the buffer */ resid += segresid; cnt = len & ~0x3; /* * If resid + segresid is >= 4, add multiples of 4 to the count and * decrease the residual by that much. */ cnt += resid & ~0x3; resid -= resid & ~0x3; ptr = buf; first_pass = 1; /* * Save the old window base value. */ origwin = CSR_READ_4(sc, TI_WINBASE); while (cnt) { bus_size_t ti_offset; if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, rounddown2(segptr, TI_WINLEN)); ti_offset = TI_WINDOW + (segptr & (TI_WINLEN -1)); if (readdata) { bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, (uint32_t *)sc->ti_membuf, segsize >> 2); if (useraddr) { /* * Yeah, this is a little on the kludgy * side, but at least this code is only * used for debugging. */ ti_bcopy_swap(sc->ti_membuf, sc->ti_membuf2, segsize, TI_SWAP_NTOH); TI_UNLOCK(sc); if (first_pass) { copyout(&sc->ti_membuf2[segresid], ptr, segsize - segresid); first_pass = 0; } else copyout(sc->ti_membuf2, ptr, segsize); TI_LOCK(sc); } else { if (first_pass) { ti_bcopy_swap(sc->ti_membuf, sc->ti_membuf2, segsize, TI_SWAP_NTOH); TI_UNLOCK(sc); bcopy(&sc->ti_membuf2[segresid], ptr, segsize - segresid); TI_LOCK(sc); first_pass = 0; } else ti_bcopy_swap(sc->ti_membuf, ptr, segsize, TI_SWAP_NTOH); } } else { if (useraddr) { TI_UNLOCK(sc); copyin(ptr, sc->ti_membuf2, segsize); TI_LOCK(sc); ti_bcopy_swap(sc->ti_membuf2, sc->ti_membuf, segsize, TI_SWAP_HTON); } else ti_bcopy_swap(ptr, sc->ti_membuf, segsize, TI_SWAP_HTON); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, (uint32_t *)sc->ti_membuf, segsize >> 2); } segptr += segsize; ptr += segsize; cnt -= segsize; } /* * Handle leftover, non-word-aligned bytes. */ if (resid != 0) { uint32_t tmpval, tmpval2; bus_size_t ti_offset; /* * Set the segment pointer. */ CSR_WRITE_4(sc, TI_WINBASE, rounddown2(segptr, TI_WINLEN)); ti_offset = TI_WINDOW + (segptr & (TI_WINLEN - 1)); /* * First, grab whatever is in our source/destination. * We'll obviously need this for reads, but also for * writes, since we'll be doing read/modify/write. */ bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, &tmpval, 1); /* * Next, translate this from little-endian to big-endian * (at least on i386 boxes). */ tmpval2 = ntohl(tmpval); if (readdata) { /* * If we're reading, just copy the leftover number * of bytes from the host byte order buffer to * the user's buffer. */ if (useraddr) { TI_UNLOCK(sc); copyout(&tmpval2, ptr, resid); TI_LOCK(sc); } else bcopy(&tmpval2, ptr, resid); } else { /* * If we're writing, first copy the bytes to be * written into the network byte order buffer, * leaving the rest of the buffer with whatever was * originally in there. Then, swap the bytes * around into host order and write them out. * * XXX KDM the read side of this has been verified * to work, but the write side of it has not been * verified. So user beware. */ if (useraddr) { TI_UNLOCK(sc); copyin(ptr, &tmpval2, resid); TI_LOCK(sc); } else bcopy(ptr, &tmpval2, resid); tmpval = htonl(tmpval2); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, &tmpval, 1); } } CSR_WRITE_4(sc, TI_WINBASE, origwin); return (0); } static int ti_copy_scratch(struct ti_softc *sc, uint32_t tigon_addr, uint32_t len, caddr_t buf, int useraddr, int readdata, int cpu) { uint32_t segptr; int cnt; uint32_t tmpval, tmpval2; caddr_t ptr; TI_LOCK_ASSERT(sc); /* * At the moment, we don't handle non-aligned cases, we just bail. * If this proves to be a problem, it will be fixed. */ if (tigon_addr & 0x3) { device_printf(sc->ti_dev, "%s: tigon address %#x " "isn't word-aligned\n", __func__, tigon_addr); return (EINVAL); } if (len & 0x3) { device_printf(sc->ti_dev, "%s: transfer length %d " "isn't word-aligned\n", __func__, len); return (EINVAL); } segptr = tigon_addr; cnt = len; ptr = buf; while (cnt) { CSR_WRITE_4(sc, CPU_REG(TI_SRAM_ADDR, cpu), segptr); if (readdata) { tmpval2 = CSR_READ_4(sc, CPU_REG(TI_SRAM_DATA, cpu)); tmpval = ntohl(tmpval2); /* * Note: I've used this debugging interface * extensively with Alteon's 12.3.15 firmware, * compiled with GCC 2.7.2.1 and binutils 2.9.1. * * When you compile the firmware without * optimization, which is necessary sometimes in * order to properly step through it, you sometimes * read out a bogus value of 0xc0017c instead of * whatever was supposed to be in that scratchpad * location. That value is on the stack somewhere, * but I've never been able to figure out what was * causing the problem. * * The address seems to pop up in random places, * often not in the same place on two subsequent * reads. * * In any case, the underlying data doesn't seem * to be affected, just the value read out. * * KDM, 3/7/2000 */ if (tmpval2 == 0xc0017c) device_printf(sc->ti_dev, "found 0xc0017c at " "%#x (tmpval2)\n", segptr); if (tmpval == 0xc0017c) device_printf(sc->ti_dev, "found 0xc0017c at " "%#x (tmpval)\n", segptr); if (useraddr) copyout(&tmpval, ptr, 4); else bcopy(&tmpval, ptr, 4); } else { if (useraddr) copyin(ptr, &tmpval2, 4); else bcopy(ptr, &tmpval2, 4); tmpval = htonl(tmpval2); CSR_WRITE_4(sc, CPU_REG(TI_SRAM_DATA, cpu), tmpval); } cnt -= 4; segptr += 4; ptr += 4; } return (0); } static int ti_bcopy_swap(const void *src, void *dst, size_t len, ti_swap_type swap_type) { const uint8_t *tmpsrc; uint8_t *tmpdst; size_t tmplen; if (len & 0x3) { printf("ti_bcopy_swap: length %zd isn't 32-bit aligned\n", len); return (-1); } tmpsrc = src; tmpdst = dst; tmplen = len; while (tmplen) { if (swap_type == TI_SWAP_NTOH) *(uint32_t *)tmpdst = ntohl(*(const uint32_t *)tmpsrc); else *(uint32_t *)tmpdst = htonl(*(const uint32_t *)tmpsrc); tmpsrc += 4; tmpdst += 4; tmplen -= 4; } return (0); } /* * Load firmware image into the NIC. Check that the firmware revision * is acceptable and see if we want the firmware for the Tigon 1 or * Tigon 2. */ static void ti_loadfw(struct ti_softc *sc) { TI_LOCK_ASSERT(sc); switch (sc->ti_hwrev) { case TI_HWREV_TIGON: if (tigonFwReleaseMajor != TI_FIRMWARE_MAJOR || tigonFwReleaseMinor != TI_FIRMWARE_MINOR || tigonFwReleaseFix != TI_FIRMWARE_FIX) { device_printf(sc->ti_dev, "firmware revision mismatch; " "want %d.%d.%d, got %d.%d.%d\n", TI_FIRMWARE_MAJOR, TI_FIRMWARE_MINOR, TI_FIRMWARE_FIX, tigonFwReleaseMajor, tigonFwReleaseMinor, tigonFwReleaseFix); return; } ti_mem_write(sc, tigonFwTextAddr, tigonFwTextLen, tigonFwText); ti_mem_write(sc, tigonFwDataAddr, tigonFwDataLen, tigonFwData); ti_mem_write(sc, tigonFwRodataAddr, tigonFwRodataLen, tigonFwRodata); ti_mem_zero(sc, tigonFwBssAddr, tigonFwBssLen); ti_mem_zero(sc, tigonFwSbssAddr, tigonFwSbssLen); CSR_WRITE_4(sc, TI_CPU_PROGRAM_COUNTER, tigonFwStartAddr); break; case TI_HWREV_TIGON_II: if (tigon2FwReleaseMajor != TI_FIRMWARE_MAJOR || tigon2FwReleaseMinor != TI_FIRMWARE_MINOR || tigon2FwReleaseFix != TI_FIRMWARE_FIX) { device_printf(sc->ti_dev, "firmware revision mismatch; " "want %d.%d.%d, got %d.%d.%d\n", TI_FIRMWARE_MAJOR, TI_FIRMWARE_MINOR, TI_FIRMWARE_FIX, tigon2FwReleaseMajor, tigon2FwReleaseMinor, tigon2FwReleaseFix); return; } ti_mem_write(sc, tigon2FwTextAddr, tigon2FwTextLen, tigon2FwText); ti_mem_write(sc, tigon2FwDataAddr, tigon2FwDataLen, tigon2FwData); ti_mem_write(sc, tigon2FwRodataAddr, tigon2FwRodataLen, tigon2FwRodata); ti_mem_zero(sc, tigon2FwBssAddr, tigon2FwBssLen); ti_mem_zero(sc, tigon2FwSbssAddr, tigon2FwSbssLen); CSR_WRITE_4(sc, TI_CPU_PROGRAM_COUNTER, tigon2FwStartAddr); break; default: device_printf(sc->ti_dev, "can't load firmware: unknown hardware rev\n"); break; } } /* * Send the NIC a command via the command ring. */ static void ti_cmd(struct ti_softc *sc, struct ti_cmd_desc *cmd) { int index; index = sc->ti_cmd_saved_prodidx; CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(cmd)); TI_INC(index, TI_CMD_RING_CNT); CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, index); sc->ti_cmd_saved_prodidx = index; } /* * Send the NIC an extended command. The 'len' parameter specifies the * number of command slots to include after the initial command. */ static void ti_cmd_ext(struct ti_softc *sc, struct ti_cmd_desc *cmd, caddr_t arg, int len) { int index; int i; index = sc->ti_cmd_saved_prodidx; CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(cmd)); TI_INC(index, TI_CMD_RING_CNT); for (i = 0; i < len; i++) { CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(&arg[i * 4])); TI_INC(index, TI_CMD_RING_CNT); } CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, index); sc->ti_cmd_saved_prodidx = index; } /* * Handle events that have triggered interrupts. */ static void ti_handle_events(struct ti_softc *sc) { struct ti_event_desc *e; if (sc->ti_rdata.ti_event_ring == NULL) return; bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_POSTREAD); while (sc->ti_ev_saved_considx != sc->ti_ev_prodidx.ti_idx) { e = &sc->ti_rdata.ti_event_ring[sc->ti_ev_saved_considx]; switch (TI_EVENT_EVENT(e)) { case TI_EV_LINKSTAT_CHANGED: sc->ti_linkstat = TI_EVENT_CODE(e); if (sc->ti_linkstat == TI_EV_CODE_LINK_UP) { if_link_state_change(sc->ti_ifp, LINK_STATE_UP); sc->ti_ifp->if_baudrate = IF_Mbps(100); if (bootverbose) device_printf(sc->ti_dev, "10/100 link up\n"); } else if (sc->ti_linkstat == TI_EV_CODE_GIG_LINK_UP) { if_link_state_change(sc->ti_ifp, LINK_STATE_UP); sc->ti_ifp->if_baudrate = IF_Gbps(1UL); if (bootverbose) device_printf(sc->ti_dev, "gigabit link up\n"); } else if (sc->ti_linkstat == TI_EV_CODE_LINK_DOWN) { if_link_state_change(sc->ti_ifp, LINK_STATE_DOWN); sc->ti_ifp->if_baudrate = 0; if (bootverbose) device_printf(sc->ti_dev, "link down\n"); } break; case TI_EV_ERROR: if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_INVAL_CMD) device_printf(sc->ti_dev, "invalid command\n"); else if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_UNIMP_CMD) device_printf(sc->ti_dev, "unknown command\n"); else if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_BADCFG) device_printf(sc->ti_dev, "bad config data\n"); break; case TI_EV_FIRMWARE_UP: ti_init2(sc); break; case TI_EV_STATS_UPDATED: case TI_EV_RESET_JUMBO_RING: case TI_EV_MCAST_UPDATED: /* Who cares. */ break; default: device_printf(sc->ti_dev, "unknown event: %d\n", TI_EVENT_EVENT(e)); break; } /* Advance the consumer index. */ TI_INC(sc->ti_ev_saved_considx, TI_EVENT_RING_CNT); CSR_WRITE_4(sc, TI_GCR_EVENTCONS_IDX, sc->ti_ev_saved_considx); } bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_PREREAD); } struct ti_dmamap_arg { bus_addr_t ti_busaddr; }; static void ti_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct ti_dmamap_arg *ctx; if (error) return; KASSERT(nseg == 1, ("%s: %d segments returned!", __func__, nseg)); ctx = arg; ctx->ti_busaddr = segs->ds_addr; } static int ti_dma_ring_alloc(struct ti_softc *sc, bus_size_t alignment, bus_size_t maxsize, bus_dma_tag_t *tag, uint8_t **ring, bus_dmamap_t *map, bus_addr_t *paddr, const char *msg) { struct ti_dmamap_arg ctx; int error; error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, alignment, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, maxsize, 1, maxsize, 0, NULL, NULL, tag); if (error != 0) { device_printf(sc->ti_dev, "could not create %s dma tag\n", msg); return (error); } /* Allocate DMA'able memory for ring. */ error = bus_dmamem_alloc(*tag, (void **)ring, BUS_DMA_NOWAIT | BUS_DMA_ZERO | BUS_DMA_COHERENT, map); if (error != 0) { device_printf(sc->ti_dev, "could not allocate DMA'able memory for %s\n", msg); return (error); } /* Load the address of the ring. */ ctx.ti_busaddr = 0; error = bus_dmamap_load(*tag, *map, *ring, maxsize, ti_dma_map_addr, &ctx, BUS_DMA_NOWAIT); if (error != 0) { device_printf(sc->ti_dev, "could not load DMA'able memory for %s\n", msg); return (error); } *paddr = ctx.ti_busaddr; return (0); } static void ti_dma_ring_free(struct ti_softc *sc, bus_dma_tag_t *tag, uint8_t **ring, bus_dmamap_t map, bus_addr_t *paddr) { if (*paddr != 0) { bus_dmamap_unload(*tag, map); *paddr = 0; } if (*ring != NULL) { bus_dmamem_free(*tag, *ring, map); *ring = NULL; } if (*tag) { bus_dma_tag_destroy(*tag); *tag = NULL; } } static int ti_dma_alloc(struct ti_softc *sc) { bus_addr_t lowaddr; int i, error; lowaddr = BUS_SPACE_MAXADDR; if (sc->ti_dac == 0) lowaddr = BUS_SPACE_MAXADDR_32BIT; error = bus_dma_tag_create(bus_get_dma_tag(sc->ti_dev), 1, 0, lowaddr, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0, NULL, NULL, &sc->ti_cdata.ti_parent_tag); if (error != 0) { device_printf(sc->ti_dev, "could not allocate parent dma tag\n"); return (ENOMEM); } error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, sizeof(struct ti_gib), &sc->ti_cdata.ti_gib_tag, (uint8_t **)&sc->ti_rdata.ti_info, &sc->ti_cdata.ti_gib_map, &sc->ti_rdata.ti_info_paddr, "GIB"); if (error) return (error); /* Producer/consumer status */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, sizeof(struct ti_status), &sc->ti_cdata.ti_status_tag, (uint8_t **)&sc->ti_rdata.ti_status, &sc->ti_cdata.ti_status_map, &sc->ti_rdata.ti_status_paddr, "event ring"); if (error) return (error); /* Event ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_EVENT_RING_SZ, &sc->ti_cdata.ti_event_ring_tag, (uint8_t **)&sc->ti_rdata.ti_event_ring, &sc->ti_cdata.ti_event_ring_map, &sc->ti_rdata.ti_event_ring_paddr, "event ring"); if (error) return (error); /* Command ring lives in shared memory so no need to create DMA area. */ /* Standard RX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_STD_RX_RING_SZ, &sc->ti_cdata.ti_rx_std_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_std_ring, &sc->ti_cdata.ti_rx_std_ring_map, &sc->ti_rdata.ti_rx_std_ring_paddr, "RX ring"); if (error) return (error); /* Jumbo RX ring */ error = ti_dma_ring_alloc(sc, TI_JUMBO_RING_ALIGN, TI_JUMBO_RX_RING_SZ, &sc->ti_cdata.ti_rx_jumbo_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_jumbo_ring, &sc->ti_cdata.ti_rx_jumbo_ring_map, &sc->ti_rdata.ti_rx_jumbo_ring_paddr, "jumbo RX ring"); if (error) return (error); /* RX return ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_RX_RETURN_RING_SZ, &sc->ti_cdata.ti_rx_return_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_return_ring, &sc->ti_cdata.ti_rx_return_ring_map, &sc->ti_rdata.ti_rx_return_ring_paddr, "RX return ring"); if (error) return (error); /* Create DMA tag for standard RX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0, NULL, NULL, &sc->ti_cdata.ti_rx_std_tag); if (error) { device_printf(sc->ti_dev, "could not allocate RX dma tag\n"); return (error); } /* Create DMA tag for jumbo RX mbufs. */ #ifdef TI_SF_BUF_JUMBO /* * The VM system will take care of providing aligned pages. Alignment * is set to 1 here so that busdma resources won't be wasted. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, PAGE_SIZE * 4, 4, PAGE_SIZE, 0, NULL, NULL, &sc->ti_cdata.ti_rx_jumbo_tag); #else error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM9BYTES, 1, MJUM9BYTES, 0, NULL, NULL, &sc->ti_cdata.ti_rx_jumbo_tag); #endif if (error) { device_printf(sc->ti_dev, "could not allocate jumbo RX dma tag\n"); return (error); } /* Create DMA tag for TX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES * TI_MAXTXSEGS, TI_MAXTXSEGS, MCLBYTES, 0, NULL, NULL, &sc->ti_cdata.ti_tx_tag); if (error) { device_printf(sc->ti_dev, "could not allocate TX dma tag\n"); return (ENOMEM); } /* Create DMA maps for RX buffers. */ for (i = 0; i < TI_STD_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_std_tag, 0, &sc->ti_cdata.ti_rx_std_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_std_tag, 0, &sc->ti_cdata.ti_rx_std_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for RX\n"); return (error); } /* Create DMA maps for jumbo RX buffers. */ for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_jumbo_tag, 0, &sc->ti_cdata.ti_rx_jumbo_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for jumbo RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_jumbo_tag, 0, &sc->ti_cdata.ti_rx_jumbo_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for jumbo RX\n"); return (error); } /* Create DMA maps for TX buffers. */ for (i = 0; i < TI_TX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_tx_tag, 0, &sc->ti_cdata.ti_txdesc[i].tx_dmamap); if (error) { device_printf(sc->ti_dev, "could not create DMA map for TX\n"); return (ENOMEM); } } /* Mini ring and TX ring is not available on Tigon 1. */ if (sc->ti_hwrev == TI_HWREV_TIGON) return (0); /* TX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_TX_RING_SZ, &sc->ti_cdata.ti_tx_ring_tag, (uint8_t **)&sc->ti_rdata.ti_tx_ring, &sc->ti_cdata.ti_tx_ring_map, &sc->ti_rdata.ti_tx_ring_paddr, "TX ring"); if (error) return (error); /* Mini RX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_MINI_RX_RING_SZ, &sc->ti_cdata.ti_rx_mini_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_mini_ring, &sc->ti_cdata.ti_rx_mini_ring_map, &sc->ti_rdata.ti_rx_mini_ring_paddr, "mini RX ring"); if (error) return (error); /* Create DMA tag for mini RX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MHLEN, 1, MHLEN, 0, NULL, NULL, &sc->ti_cdata.ti_rx_mini_tag); if (error) { device_printf(sc->ti_dev, "could not allocate mini RX dma tag\n"); return (error); } /* Create DMA maps for mini RX buffers. */ for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_mini_tag, 0, &sc->ti_cdata.ti_rx_mini_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for mini RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_mini_tag, 0, &sc->ti_cdata.ti_rx_mini_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for mini RX\n"); return (error); } return (0); } static void ti_dma_free(struct ti_softc *sc) { int i; /* Destroy DMA maps for RX buffers. */ for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_std_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i]); sc->ti_cdata.ti_rx_std_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_std_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_sparemap); sc->ti_cdata.ti_rx_std_sparemap = NULL; } if (sc->ti_cdata.ti_rx_std_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_std_tag); sc->ti_cdata.ti_rx_std_tag = NULL; } /* Destroy DMA maps for jumbo RX buffers. */ for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_jumbo_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i]); sc->ti_cdata.ti_rx_jumbo_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_jumbo_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_sparemap); sc->ti_cdata.ti_rx_jumbo_sparemap = NULL; } if (sc->ti_cdata.ti_rx_jumbo_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_jumbo_tag); sc->ti_cdata.ti_rx_jumbo_tag = NULL; } /* Destroy DMA maps for mini RX buffers. */ for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_mini_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i]); sc->ti_cdata.ti_rx_mini_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_mini_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_sparemap); sc->ti_cdata.ti_rx_mini_sparemap = NULL; } if (sc->ti_cdata.ti_rx_mini_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_mini_tag); sc->ti_cdata.ti_rx_mini_tag = NULL; } /* Destroy DMA maps for TX buffers. */ for (i = 0; i < TI_TX_RING_CNT; i++) { if (sc->ti_cdata.ti_txdesc[i].tx_dmamap) { bus_dmamap_destroy(sc->ti_cdata.ti_tx_tag, sc->ti_cdata.ti_txdesc[i].tx_dmamap); sc->ti_cdata.ti_txdesc[i].tx_dmamap = NULL; } } if (sc->ti_cdata.ti_tx_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_tx_tag); sc->ti_cdata.ti_tx_tag = NULL; } /* Destroy standard RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_std_ring_tag, (void *)&sc->ti_rdata.ti_rx_std_ring, sc->ti_cdata.ti_rx_std_ring_map, &sc->ti_rdata.ti_rx_std_ring_paddr); /* Destroy jumbo RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_jumbo_ring_tag, (void *)&sc->ti_rdata.ti_rx_jumbo_ring, sc->ti_cdata.ti_rx_jumbo_ring_map, &sc->ti_rdata.ti_rx_jumbo_ring_paddr); /* Destroy mini RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_mini_ring_tag, (void *)&sc->ti_rdata.ti_rx_mini_ring, sc->ti_cdata.ti_rx_mini_ring_map, &sc->ti_rdata.ti_rx_mini_ring_paddr); /* Destroy RX return ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_return_ring_tag, (void *)&sc->ti_rdata.ti_rx_return_ring, sc->ti_cdata.ti_rx_return_ring_map, &sc->ti_rdata.ti_rx_return_ring_paddr); /* Destroy TX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_tx_ring_tag, (void *)&sc->ti_rdata.ti_tx_ring, sc->ti_cdata.ti_tx_ring_map, &sc->ti_rdata.ti_tx_ring_paddr); /* Destroy status block. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_status_tag, (void *)&sc->ti_rdata.ti_status, sc->ti_cdata.ti_status_map, &sc->ti_rdata.ti_status_paddr); /* Destroy event ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_event_ring_tag, (void *)&sc->ti_rdata.ti_event_ring, sc->ti_cdata.ti_event_ring_map, &sc->ti_rdata.ti_event_ring_paddr); /* Destroy GIB */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_gib_tag, (void *)&sc->ti_rdata.ti_info, sc->ti_cdata.ti_gib_map, &sc->ti_rdata.ti_info_paddr); /* Destroy the parent tag. */ if (sc->ti_cdata.ti_parent_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_parent_tag); sc->ti_cdata.ti_parent_tag = NULL; } } /* * Intialize a standard receive ring descriptor. */ static int ti_newbuf_std(struct ti_softc *sc, int i) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MCLBYTES; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_std_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i]); } map = sc->ti_cdata.ti_rx_std_maps[i]; sc->ti_cdata.ti_rx_std_maps[i] = sc->ti_cdata.ti_rx_std_sparemap; sc->ti_cdata.ti_rx_std_sparemap = map; sc->ti_cdata.ti_rx_std_chain[i] = m; r = &sc->ti_rdata.ti_rx_std_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = 0; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i], BUS_DMASYNC_PREREAD); return (0); } /* * Intialize a mini receive ring descriptor. This only applies to * the Tigon 2. */ static int ti_newbuf_mini(struct ti_softc *sc, int i) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MHLEN; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_mini_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i]); } map = sc->ti_cdata.ti_rx_mini_maps[i]; sc->ti_cdata.ti_rx_mini_maps[i] = sc->ti_cdata.ti_rx_mini_sparemap; sc->ti_cdata.ti_rx_mini_sparemap = map; sc->ti_cdata.ti_rx_mini_chain[i] = m; r = &sc->ti_rdata.ti_rx_mini_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = TI_BDFLAG_MINI_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i], BUS_DMASYNC_PREREAD); return (0); } #ifndef TI_SF_BUF_JUMBO /* * Initialize a jumbo receive ring descriptor. This allocates * a jumbo buffer from the pool managed internally by the driver. */ static int ti_newbuf_jumbo(struct ti_softc *sc, int i, struct mbuf *dummy) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; (void)dummy; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MJUM9BYTES; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_jumbo_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i]); } map = sc->ti_cdata.ti_rx_jumbo_maps[i]; sc->ti_cdata.ti_rx_jumbo_maps[i] = sc->ti_cdata.ti_rx_jumbo_sparemap; sc->ti_cdata.ti_rx_jumbo_sparemap = map; sc->ti_cdata.ti_rx_jumbo_chain[i] = m; r = &sc->ti_rdata.ti_rx_jumbo_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i], BUS_DMASYNC_PREREAD); return (0); } #else #if (PAGE_SIZE == 4096) #define NPAYLOAD 2 #else #define NPAYLOAD 1 #endif #define TCP_HDR_LEN (52 + sizeof(struct ether_header)) #define UDP_HDR_LEN (28 + sizeof(struct ether_header)) #define NFS_HDR_LEN (UDP_HDR_LEN) static int HDR_LEN = TCP_HDR_LEN; /* * Initialize a jumbo receive ring descriptor. This allocates * a jumbo buffer from the pool managed internally by the driver. */ static int ti_newbuf_jumbo(struct ti_softc *sc, int idx, struct mbuf *m_old) { bus_dmamap_t map; struct mbuf *cur, *m_new = NULL; struct mbuf *m[3] = {NULL, NULL, NULL}; struct ti_rx_desc_ext *r; vm_page_t frame; /* 1 extra buf to make nobufs easy*/ struct sf_buf *sf[3] = {NULL, NULL, NULL}; int i; bus_dma_segment_t segs[4]; int nsegs; if (m_old != NULL) { m_new = m_old; cur = m_old->m_next; for (i = 0; i <= NPAYLOAD; i++){ m[i] = cur; cur = cur->m_next; } } else { /* Allocate the mbufs. */ MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { device_printf(sc->ti_dev, "mbuf allocation failed " "-- packet dropped!\n"); goto nobufs; } MGET(m[NPAYLOAD], M_NOWAIT, MT_DATA); if (m[NPAYLOAD] == NULL) { device_printf(sc->ti_dev, "cluster mbuf allocation " "failed -- packet dropped!\n"); goto nobufs; } if (!(MCLGET(m[NPAYLOAD], M_NOWAIT))) { device_printf(sc->ti_dev, "mbuf allocation failed " "-- packet dropped!\n"); goto nobufs; } m[NPAYLOAD]->m_len = MCLBYTES; for (i = 0; i < NPAYLOAD; i++){ MGET(m[i], M_NOWAIT, MT_DATA); if (m[i] == NULL) { device_printf(sc->ti_dev, "mbuf allocation " "failed -- packet dropped!\n"); goto nobufs; } - frame = vm_page_alloc(NULL, 0, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + frame = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (frame == NULL) { device_printf(sc->ti_dev, "buffer allocation " "failed -- packet dropped!\n"); printf(" index %d page %d\n", idx, i); goto nobufs; } sf[i] = sf_buf_alloc(frame, SFB_NOWAIT); if (sf[i] == NULL) { vm_page_unwire_noq(frame); vm_page_free(frame); device_printf(sc->ti_dev, "buffer allocation " "failed -- packet dropped!\n"); printf(" index %d page %d\n", idx, i); goto nobufs; } } for (i = 0; i < NPAYLOAD; i++){ /* Attach the buffer to the mbuf. */ m[i]->m_data = (void *)sf_buf_kva(sf[i]); m[i]->m_len = PAGE_SIZE; MEXTADD(m[i], sf_buf_kva(sf[i]), PAGE_SIZE, sf_mext_free, (void*)sf_buf_kva(sf[i]), sf[i], 0, EXT_DISPOSABLE); m[i]->m_next = m[i+1]; } /* link the buffers to the header */ m_new->m_next = m[0]; m_new->m_data += ETHER_ALIGN; if (sc->ti_hdrsplit) m_new->m_len = MHLEN - ETHER_ALIGN; else m_new->m_len = HDR_LEN; m_new->m_pkthdr.len = NPAYLOAD * PAGE_SIZE + m_new->m_len; } /* Set up the descriptor. */ r = &sc->ti_rdata.ti_rx_jumbo_ring[idx]; sc->ti_cdata.ti_rx_jumbo_chain[idx] = m_new; map = sc->ti_cdata.ti_rx_jumbo_maps[i]; if (bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_jumbo_tag, map, m_new, segs, &nsegs, 0)) return (ENOBUFS); if ((nsegs < 1) || (nsegs > 4)) return (ENOBUFS); ti_hostaddr64(&r->ti_addr0, segs[0].ds_addr); r->ti_len0 = m_new->m_len; ti_hostaddr64(&r->ti_addr1, segs[1].ds_addr); r->ti_len1 = PAGE_SIZE; ti_hostaddr64(&r->ti_addr2, segs[2].ds_addr); r->ti_len2 = m[1]->m_ext.ext_size; /* could be PAGE_SIZE or MCLBYTES */ if (PAGE_SIZE == 4096) { ti_hostaddr64(&r->ti_addr3, segs[3].ds_addr); r->ti_len3 = MCLBYTES; } else { r->ti_len3 = 0; } r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING|TI_RCB_FLAG_USE_EXT_RX_BD; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM|TI_BDFLAG_IP_CKSUM; r->ti_idx = idx; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_PREREAD); return (0); nobufs: /* * Warning! : * This can only be called before the mbufs are strung together. * If the mbufs are strung together, m_freem() will free the chain, * so that the later mbufs will be freed multiple times. */ if (m_new) m_freem(m_new); for (i = 0; i < 3; i++) { if (m[i]) m_freem(m[i]); if (sf[i]) sf_mext_free((void *)sf_buf_kva(sf[i]), sf[i]); } return (ENOBUFS); } #endif /* * The standard receive ring has 512 entries in it. At 2K per mbuf cluster, * that's 1MB or memory, which is a lot. For now, we fill only the first * 256 ring entries and hope that our CPU is fast enough to keep up with * the NIC. */ static int ti_init_rx_ring_std(struct ti_softc *sc) { int i; struct ti_cmd_desc cmd; for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (ti_newbuf_std(sc, i) != 0) return (ENOBUFS); } sc->ti_std = TI_STD_RX_RING_CNT - 1; TI_UPDATE_STDPROD(sc, TI_STD_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_std(struct ti_softc *sc) { bus_dmamap_t map; int i; for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_std_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_std_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_std_tag, map); m_freem(sc->ti_cdata.ti_rx_std_chain[i]); sc->ti_cdata.ti_rx_std_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_std_ring, TI_STD_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_rx_ring_jumbo(struct ti_softc *sc) { struct ti_cmd_desc cmd; int i; for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (ti_newbuf_jumbo(sc, i, NULL) != 0) return (ENOBUFS); } sc->ti_jumbo = TI_JUMBO_RX_RING_CNT - 1; TI_UPDATE_JUMBOPROD(sc, TI_JUMBO_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_jumbo(struct ti_softc *sc) { bus_dmamap_t map; int i; for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_jumbo_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_jumbo_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, map); m_freem(sc->ti_cdata.ti_rx_jumbo_chain[i]); sc->ti_cdata.ti_rx_jumbo_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_jumbo_ring, TI_JUMBO_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_rx_ring_mini(struct ti_softc *sc) { int i; for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (ti_newbuf_mini(sc, i) != 0) return (ENOBUFS); } sc->ti_mini = TI_MINI_RX_RING_CNT - 1; TI_UPDATE_MINIPROD(sc, TI_MINI_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_mini(struct ti_softc *sc) { bus_dmamap_t map; int i; if (sc->ti_rdata.ti_rx_mini_ring == NULL) return; for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_mini_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_mini_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_mini_tag, map); m_freem(sc->ti_cdata.ti_rx_mini_chain[i]); sc->ti_cdata.ti_rx_mini_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_mini_ring, TI_MINI_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_PREWRITE); } static void ti_free_tx_ring(struct ti_softc *sc) { struct ti_txdesc *txd; int i; if (sc->ti_rdata.ti_tx_ring == NULL) return; for (i = 0; i < TI_TX_RING_CNT; i++) { txd = &sc->ti_cdata.ti_txdesc[i]; if (txd->tx_m != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); m_freem(txd->tx_m); txd->tx_m = NULL; } } bzero(sc->ti_rdata.ti_tx_ring, TI_TX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_tx_ring(struct ti_softc *sc) { struct ti_txdesc *txd; int i; STAILQ_INIT(&sc->ti_cdata.ti_txfreeq); STAILQ_INIT(&sc->ti_cdata.ti_txbusyq); for (i = 0; i < TI_TX_RING_CNT; i++) { txd = &sc->ti_cdata.ti_txdesc[i]; STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txfreeq, txd, tx_q); } sc->ti_txcnt = 0; sc->ti_tx_saved_considx = 0; sc->ti_tx_saved_prodidx = 0; CSR_WRITE_4(sc, TI_MB_SENDPROD_IDX, 0); return (0); } /* * The Tigon 2 firmware has a new way to add/delete multicast addresses, * but we have to support the old way too so that Tigon 1 cards will * work. */ static u_int ti_add_mcast(void *arg, struct sockaddr_dl *sdl, u_int count) { struct ti_softc *sc = arg; struct ti_cmd_desc cmd; uint16_t *m; uint32_t ext[2] = {0, 0}; m = (uint16_t *)LLADDR(sdl); switch (sc->ti_hwrev) { case TI_HWREV_TIGON: CSR_WRITE_4(sc, TI_GCR_MAR0, htons(m[0])); CSR_WRITE_4(sc, TI_GCR_MAR1, (htons(m[1]) << 16) | htons(m[2])); TI_DO_CMD(TI_CMD_ADD_MCAST_ADDR, 0, 0); break; case TI_HWREV_TIGON_II: ext[0] = htons(m[0]); ext[1] = (htons(m[1]) << 16) | htons(m[2]); TI_DO_CMD_EXT(TI_CMD_EXT_ADD_MCAST, 0, 0, (caddr_t)&ext, 2); break; default: device_printf(sc->ti_dev, "unknown hwrev\n"); return (0); } return (1); } static u_int ti_del_mcast(void *arg, struct sockaddr_dl *sdl, u_int count) { struct ti_softc *sc = arg; struct ti_cmd_desc cmd; uint16_t *m; uint32_t ext[2] = {0, 0}; m = (uint16_t *)LLADDR(sdl); switch (sc->ti_hwrev) { case TI_HWREV_TIGON: CSR_WRITE_4(sc, TI_GCR_MAR0, htons(m[0])); CSR_WRITE_4(sc, TI_GCR_MAR1, (htons(m[1]) << 16) | htons(m[2])); TI_DO_CMD(TI_CMD_DEL_MCAST_ADDR, 0, 0); break; case TI_HWREV_TIGON_II: ext[0] = htons(m[0]); ext[1] = (htons(m[1]) << 16) | htons(m[2]); TI_DO_CMD_EXT(TI_CMD_EXT_DEL_MCAST, 0, 0, (caddr_t)&ext, 2); break; default: device_printf(sc->ti_dev, "unknown hwrev\n"); return (0); } return (1); } /* * Configure the Tigon's multicast address filter. * * The actual multicast table management is a bit of a pain, thanks to * slight brain damage on the part of both Alteon and us. With our * multicast code, we are only alerted when the multicast address table * changes and at that point we only have the current list of addresses: * we only know the current state, not the previous state, so we don't * actually know what addresses were removed or added. The firmware has * state, but we can't get our grubby mits on it, and there is no 'delete * all multicast addresses' command. Hence, we have to maintain our own * state so we know what addresses have been programmed into the NIC at * any given time. */ static void ti_setmulti(struct ti_softc *sc) { struct ifnet *ifp; struct ti_cmd_desc cmd; uint32_t intrs; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; if (ifp->if_flags & IFF_ALLMULTI) { TI_DO_CMD(TI_CMD_SET_ALLMULTI, TI_CMD_CODE_ALLMULTI_ENB, 0); return; } else { TI_DO_CMD(TI_CMD_SET_ALLMULTI, TI_CMD_CODE_ALLMULTI_DIS, 0); } /* Disable interrupts. */ intrs = CSR_READ_4(sc, TI_MB_HOSTINTR); CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* First, zot all the existing filters. */ if_foreach_llmaddr(ifp, ti_del_mcast, sc); /* Now program new ones. */ if_foreach_llmaddr(ifp, ti_add_mcast, sc); /* Re-enable interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, intrs); } /* * Check to see if the BIOS has configured us for a 64 bit slot when * we aren't actually in one. If we detect this condition, we can work * around it on the Tigon 2 by setting a bit in the PCI state register, * but for the Tigon 1 we must give up and abort the interface attach. */ static int ti_64bitslot_war(struct ti_softc *sc) { if (!(CSR_READ_4(sc, TI_PCI_STATE) & TI_PCISTATE_32BIT_BUS)) { CSR_WRITE_4(sc, 0x600, 0); CSR_WRITE_4(sc, 0x604, 0); CSR_WRITE_4(sc, 0x600, 0x5555AAAA); if (CSR_READ_4(sc, 0x604) == 0x5555AAAA) { if (sc->ti_hwrev == TI_HWREV_TIGON) return (EINVAL); else { TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_32BIT_BUS); return (0); } } } return (0); } /* * Do endian, PCI and DMA initialization. Also check the on-board ROM * self-test results. */ static int ti_chipinit(struct ti_softc *sc) { uint32_t cacheline; uint32_t pci_writemax = 0; uint32_t hdrsplit; /* Initialize link to down state. */ sc->ti_linkstat = TI_EV_CODE_LINK_DOWN; /* Set endianness before we access any non-PCI registers. */ #if 0 && BYTE_ORDER == BIG_ENDIAN CSR_WRITE_4(sc, TI_MISC_HOST_CTL, TI_MHC_BIGENDIAN_INIT | (TI_MHC_BIGENDIAN_INIT << 24)); #else CSR_WRITE_4(sc, TI_MISC_HOST_CTL, TI_MHC_LITTLEENDIAN_INIT | (TI_MHC_LITTLEENDIAN_INIT << 24)); #endif /* Check the ROM failed bit to see if self-tests passed. */ if (CSR_READ_4(sc, TI_CPU_STATE) & TI_CPUSTATE_ROMFAIL) { device_printf(sc->ti_dev, "board self-diagnostics failed!\n"); return (ENODEV); } /* Halt the CPU. */ TI_SETBIT(sc, TI_CPU_STATE, TI_CPUSTATE_HALT); /* Figure out the hardware revision. */ switch (CSR_READ_4(sc, TI_MISC_HOST_CTL) & TI_MHC_CHIP_REV_MASK) { case TI_REV_TIGON_I: sc->ti_hwrev = TI_HWREV_TIGON; break; case TI_REV_TIGON_II: sc->ti_hwrev = TI_HWREV_TIGON_II; break; default: device_printf(sc->ti_dev, "unsupported chip revision\n"); return (ENODEV); } /* Do special setup for Tigon 2. */ if (sc->ti_hwrev == TI_HWREV_TIGON_II) { TI_SETBIT(sc, TI_CPU_CTL_B, TI_CPUSTATE_HALT); TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_SRAM_BANK_512K); TI_SETBIT(sc, TI_MISC_CONF, TI_MCR_SRAM_SYNCHRONOUS); } /* * We don't have firmware source for the Tigon 1, so Tigon 1 boards * can't do header splitting. */ #ifdef TI_JUMBO_HDRSPLIT if (sc->ti_hwrev != TI_HWREV_TIGON) sc->ti_hdrsplit = 1; else device_printf(sc->ti_dev, "can't do header splitting on a Tigon I board\n"); #endif /* TI_JUMBO_HDRSPLIT */ /* Set up the PCI state register. */ CSR_WRITE_4(sc, TI_PCI_STATE, TI_PCI_READ_CMD|TI_PCI_WRITE_CMD); if (sc->ti_hwrev == TI_HWREV_TIGON_II) { TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_USE_MEM_RD_MULT); } /* Clear the read/write max DMA parameters. */ TI_CLRBIT(sc, TI_PCI_STATE, (TI_PCISTATE_WRITE_MAXDMA| TI_PCISTATE_READ_MAXDMA)); /* Get cache line size. */ cacheline = CSR_READ_4(sc, TI_PCI_BIST) & 0xFF; /* * If the system has set enabled the PCI memory write * and invalidate command in the command register, set * the write max parameter accordingly. This is necessary * to use MWI with the Tigon 2. */ if (CSR_READ_4(sc, TI_PCI_CMDSTAT) & PCIM_CMD_MWIEN) { switch (cacheline) { case 1: case 4: case 8: case 16: case 32: case 64: break; default: /* Disable PCI memory write and invalidate. */ if (bootverbose) device_printf(sc->ti_dev, "cache line size %d" " not supported; disabling PCI MWI\n", cacheline); CSR_WRITE_4(sc, TI_PCI_CMDSTAT, CSR_READ_4(sc, TI_PCI_CMDSTAT) & ~PCIM_CMD_MWIEN); break; } } TI_SETBIT(sc, TI_PCI_STATE, pci_writemax); /* This sets the min dma param all the way up (0xff). */ TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_MINDMA); if (sc->ti_hdrsplit) hdrsplit = TI_OPMODE_JUMBO_HDRSPLIT; else hdrsplit = 0; /* Configure DMA variables. */ #if BYTE_ORDER == BIG_ENDIAN CSR_WRITE_4(sc, TI_GCR_OPMODE, TI_OPMODE_BYTESWAP_BD | TI_OPMODE_BYTESWAP_DATA | TI_OPMODE_WORDSWAP_BD | TI_OPMODE_WARN_ENB | TI_OPMODE_FATAL_ENB | TI_OPMODE_DONT_FRAG_JUMBO | hdrsplit); #else /* BYTE_ORDER */ CSR_WRITE_4(sc, TI_GCR_OPMODE, TI_OPMODE_BYTESWAP_DATA| TI_OPMODE_WORDSWAP_BD|TI_OPMODE_DONT_FRAG_JUMBO| TI_OPMODE_WARN_ENB|TI_OPMODE_FATAL_ENB | hdrsplit); #endif /* BYTE_ORDER */ /* * Only allow 1 DMA channel to be active at a time. * I don't think this is a good idea, but without it * the firmware racks up lots of nicDmaReadRingFull * errors. This is not compatible with hardware checksums. */ if ((sc->ti_ifp->if_capenable & (IFCAP_TXCSUM | IFCAP_RXCSUM)) == 0) TI_SETBIT(sc, TI_GCR_OPMODE, TI_OPMODE_1_DMA_ACTIVE); /* Recommended settings from Tigon manual. */ CSR_WRITE_4(sc, TI_GCR_DMA_WRITECFG, TI_DMA_STATE_THRESH_8W); CSR_WRITE_4(sc, TI_GCR_DMA_READCFG, TI_DMA_STATE_THRESH_8W); if (ti_64bitslot_war(sc)) { device_printf(sc->ti_dev, "bios thinks we're in a 64 bit slot, " "but we aren't"); return (EINVAL); } return (0); } /* * Initialize the general information block and firmware, and * start the CPU(s) running. */ static int ti_gibinit(struct ti_softc *sc) { struct ifnet *ifp; struct ti_rcb *rcb; int i; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Disable interrupts for now. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* Tell the chip where to find the general information block. */ CSR_WRITE_4(sc, TI_GCR_GENINFO_HI, (uint64_t)sc->ti_rdata.ti_info_paddr >> 32); CSR_WRITE_4(sc, TI_GCR_GENINFO_LO, sc->ti_rdata.ti_info_paddr & 0xFFFFFFFF); /* Load the firmware into SRAM. */ ti_loadfw(sc); /* Set up the contents of the general info and ring control blocks. */ /* Set up the event ring and producer pointer. */ bzero(sc->ti_rdata.ti_event_ring, TI_EVENT_RING_SZ); rcb = &sc->ti_rdata.ti_info->ti_ev_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_event_ring_paddr); rcb->ti_flags = 0; ti_hostaddr64(&sc->ti_rdata.ti_info->ti_ev_prodidx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_ev_prodidx_r)); sc->ti_ev_prodidx.ti_idx = 0; CSR_WRITE_4(sc, TI_GCR_EVENTCONS_IDX, 0); sc->ti_ev_saved_considx = 0; /* Set up the command ring and producer mailbox. */ rcb = &sc->ti_rdata.ti_info->ti_cmd_rcb; ti_hostaddr64(&rcb->ti_hostaddr, TI_GCR_NIC_ADDR(TI_GCR_CMDRING)); rcb->ti_flags = 0; rcb->ti_max_len = 0; for (i = 0; i < TI_CMD_RING_CNT; i++) { CSR_WRITE_4(sc, TI_GCR_CMDRING + (i * 4), 0); } CSR_WRITE_4(sc, TI_GCR_CMDCONS_IDX, 0); CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, 0); sc->ti_cmd_saved_prodidx = 0; /* * Assign the address of the stats refresh buffer. * We re-use the current stats buffer for this to * conserve memory. */ bzero(&sc->ti_rdata.ti_info->ti_stats, sizeof(struct ti_stats)); ti_hostaddr64(&sc->ti_rdata.ti_info->ti_refresh_stats_ptr, sc->ti_rdata.ti_info_paddr + offsetof(struct ti_gib, ti_stats)); /* Set up the standard receive ring. */ rcb = &sc->ti_rdata.ti_info->ti_std_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_std_ring_paddr); rcb->ti_max_len = TI_FRAMELEN; rcb->ti_flags = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* Set up the jumbo receive ring. */ rcb = &sc->ti_rdata.ti_info->ti_jumbo_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_jumbo_ring_paddr); #ifndef TI_SF_BUF_JUMBO rcb->ti_max_len = MJUM9BYTES - ETHER_ALIGN; rcb->ti_flags = 0; #else rcb->ti_max_len = PAGE_SIZE; rcb->ti_flags = TI_RCB_FLAG_USE_EXT_RX_BD; #endif if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* * Set up the mini ring. Only activated on the * Tigon 2 but the slot in the config block is * still there on the Tigon 1. */ rcb = &sc->ti_rdata.ti_info->ti_mini_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_mini_ring_paddr); rcb->ti_max_len = MHLEN - ETHER_ALIGN; if (sc->ti_hwrev == TI_HWREV_TIGON) rcb->ti_flags = TI_RCB_FLAG_RING_DISABLED; else rcb->ti_flags = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* * Set up the receive return ring. */ rcb = &sc->ti_rdata.ti_info->ti_return_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_return_ring_paddr); rcb->ti_flags = 0; rcb->ti_max_len = TI_RETURN_RING_CNT; ti_hostaddr64(&sc->ti_rdata.ti_info->ti_return_prodidx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_return_prodidx_r)); /* * Set up the tx ring. Note: for the Tigon 2, we have the option * of putting the transmit ring in the host's address space and * letting the chip DMA it instead of leaving the ring in the NIC's * memory and accessing it through the shared memory region. We * do this for the Tigon 2, but it doesn't work on the Tigon 1, * so we have to revert to the shared memory scheme if we detect * a Tigon 1 chip. */ CSR_WRITE_4(sc, TI_WINBASE, TI_TX_RING_BASE); if (sc->ti_rdata.ti_tx_ring != NULL) bzero(sc->ti_rdata.ti_tx_ring, TI_TX_RING_SZ); rcb = &sc->ti_rdata.ti_info->ti_tx_rcb; if (sc->ti_hwrev == TI_HWREV_TIGON) rcb->ti_flags = 0; else rcb->ti_flags = TI_RCB_FLAG_HOST_RING; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; if (sc->ti_ifp->if_capenable & IFCAP_TXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; rcb->ti_max_len = TI_TX_RING_CNT; if (sc->ti_hwrev == TI_HWREV_TIGON) ti_hostaddr64(&rcb->ti_hostaddr, TI_TX_RING_BASE); else ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_tx_ring_paddr); ti_hostaddr64(&sc->ti_rdata.ti_info->ti_tx_considx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_tx_considx_r)); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); /* Set up tunables */ #if 0 if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, (sc->ti_rx_coal_ticks / 10)); else #endif CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, sc->ti_rx_coal_ticks); CSR_WRITE_4(sc, TI_GCR_TX_COAL_TICKS, sc->ti_tx_coal_ticks); CSR_WRITE_4(sc, TI_GCR_STAT_TICKS, sc->ti_stat_ticks); CSR_WRITE_4(sc, TI_GCR_RX_MAX_COAL_BD, sc->ti_rx_max_coal_bds); CSR_WRITE_4(sc, TI_GCR_TX_MAX_COAL_BD, sc->ti_tx_max_coal_bds); CSR_WRITE_4(sc, TI_GCR_TX_BUFFER_RATIO, sc->ti_tx_buf_ratio); /* Turn interrupts on. */ CSR_WRITE_4(sc, TI_GCR_MASK_INTRS, 0); CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); /* Start CPU. */ TI_CLRBIT(sc, TI_CPU_STATE, (TI_CPUSTATE_HALT|TI_CPUSTATE_STEP)); return (0); } /* * Probe for a Tigon chip. Check the PCI vendor and device IDs * against our list and return its name if we find a match. */ static int ti_probe(device_t dev) { const struct ti_type *t; t = ti_devs; while (t->ti_name != NULL) { if ((pci_get_vendor(dev) == t->ti_vid) && (pci_get_device(dev) == t->ti_did)) { device_set_desc(dev, t->ti_name); return (BUS_PROBE_DEFAULT); } t++; } return (ENXIO); } static int ti_attach(device_t dev) { struct ifnet *ifp; struct ti_softc *sc; int error = 0, rid; u_char eaddr[6]; sc = device_get_softc(dev); sc->ti_dev = dev; mtx_init(&sc->ti_mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->ti_watchdog, &sc->ti_mtx, 0); ifmedia_init(&sc->ifmedia, IFM_IMASK, ti_ifmedia_upd, ti_ifmedia_sts); ifp = sc->ti_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); error = ENOSPC; goto fail; } sc->ti_ifp->if_hwassist = TI_CSUM_FEATURES; sc->ti_ifp->if_capabilities = IFCAP_TXCSUM | IFCAP_RXCSUM; sc->ti_ifp->if_capenable = sc->ti_ifp->if_capabilities; /* * Map control/status registers. */ pci_enable_busmaster(dev); rid = PCIR_BAR(0); sc->ti_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->ti_res == NULL) { device_printf(dev, "couldn't map memory\n"); error = ENXIO; goto fail; } sc->ti_btag = rman_get_bustag(sc->ti_res); sc->ti_bhandle = rman_get_bushandle(sc->ti_res); /* Allocate interrupt */ rid = 0; sc->ti_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (sc->ti_irq == NULL) { device_printf(dev, "couldn't map interrupt\n"); error = ENXIO; goto fail; } if (ti_chipinit(sc)) { device_printf(dev, "chip initialization failed\n"); error = ENXIO; goto fail; } /* Zero out the NIC's on-board SRAM. */ ti_mem_zero(sc, 0x2000, 0x100000 - 0x2000); /* Init again -- zeroing memory may have clobbered some registers. */ if (ti_chipinit(sc)) { device_printf(dev, "chip initialization failed\n"); error = ENXIO; goto fail; } /* * Get station address from the EEPROM. Note: the manual states * that the MAC address is at offset 0x8c, however the data is * stored as two longwords (since that's how it's loaded into * the NIC). This means the MAC address is actually preceded * by two zero bytes. We need to skip over those. */ if (ti_read_eeprom(sc, eaddr, TI_EE_MAC_OFFSET + 2, ETHER_ADDR_LEN)) { device_printf(dev, "failed to read station address\n"); error = ENXIO; goto fail; } /* Allocate working area for memory dump. */ sc->ti_membuf = malloc(sizeof(uint8_t) * TI_WINLEN, M_DEVBUF, M_NOWAIT); sc->ti_membuf2 = malloc(sizeof(uint8_t) * TI_WINLEN, M_DEVBUF, M_NOWAIT); if (sc->ti_membuf == NULL || sc->ti_membuf2 == NULL) { device_printf(dev, "cannot allocate memory buffer\n"); error = ENOMEM; goto fail; } if ((error = ti_dma_alloc(sc)) != 0) goto fail; /* * We really need a better way to tell a 1000baseTX card * from a 1000baseSX one, since in theory there could be * OEMed 1000baseTX cards from lame vendors who aren't * clever enough to change the PCI ID. For the moment * though, the AceNIC is the only copper card available. */ if (pci_get_vendor(dev) == ALT_VENDORID && pci_get_device(dev) == ALT_DEVICEID_ACENIC_COPPER) sc->ti_copper = 1; /* Ok, it's not the only copper card available. */ if (pci_get_vendor(dev) == NG_VENDORID && pci_get_device(dev) == NG_DEVICEID_GA620T) sc->ti_copper = 1; /* Set default tunable values. */ ti_sysctl_node(sc); /* Set up ifnet structure */ ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ti_ioctl; ifp->if_start = ti_start; ifp->if_init = ti_init; ifp->if_get_counter = ti_get_counter; ifp->if_baudrate = IF_Gbps(1UL); ifp->if_snd.ifq_drv_maxlen = TI_TX_RING_CNT - 1; IFQ_SET_MAXLEN(&ifp->if_snd, ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&ifp->if_snd); /* Set up ifmedia support. */ if (sc->ti_copper) { /* * Copper cards allow manual 10/100 mode selection, * but not manual 1000baseTX mode selection. Why? * Because currently there's no way to specify the * master/slave setting through the firmware interface, * so Alteon decided to just bag it and handle it * via autonegotiation. */ ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL); } else { /* Fiber cards don't support 10/100 modes. */ ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_SX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_SX|IFM_FDX, 0, NULL); } ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_AUTO, 0, NULL); ifmedia_set(&sc->ifmedia, IFM_ETHER|IFM_AUTO); /* * We're assuming here that card initialization is a sequential * thing. If it isn't, multiple cards probing at the same time * could stomp on the list of softcs here. */ /* Register the device */ sc->dev = make_dev(&ti_cdevsw, device_get_unit(dev), UID_ROOT, GID_OPERATOR, 0600, "ti%d", device_get_unit(dev)); sc->dev->si_drv1 = sc; /* * Call MI attach routine. */ ether_ifattach(ifp, eaddr); /* VLAN capability setup. */ ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWTAGGING; ifp->if_capenable = ifp->if_capabilities; /* Tell the upper layer we support VLAN over-sized frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* Driver supports link state tracking. */ ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; /* Hook interrupt last to avoid having to lock softc */ error = bus_setup_intr(dev, sc->ti_irq, INTR_TYPE_NET|INTR_MPSAFE, NULL, ti_intr, sc, &sc->ti_intrhand); if (error) { device_printf(dev, "couldn't set up irq\n"); goto fail; } fail: if (error) ti_detach(dev); return (error); } /* * Shutdown hardware and free up resources. This can be called any * time after the mutex has been initialized. It is called in both * the error case in attach and the normal detach case so it needs * to be careful about only freeing resources that have actually been * allocated. */ static int ti_detach(device_t dev) { struct ti_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); if (sc->dev) destroy_dev(sc->dev); KASSERT(mtx_initialized(&sc->ti_mtx), ("ti mutex not initialized")); ifp = sc->ti_ifp; if (device_is_attached(dev)) { ether_ifdetach(ifp); TI_LOCK(sc); ti_stop(sc); TI_UNLOCK(sc); } /* These should only be active if attach succeeded */ callout_drain(&sc->ti_watchdog); bus_generic_detach(dev); ti_dma_free(sc); ifmedia_removeall(&sc->ifmedia); if (sc->ti_intrhand) bus_teardown_intr(dev, sc->ti_irq, sc->ti_intrhand); if (sc->ti_irq) bus_release_resource(dev, SYS_RES_IRQ, 0, sc->ti_irq); if (sc->ti_res) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), sc->ti_res); } if (ifp) if_free(ifp); if (sc->ti_membuf) free(sc->ti_membuf, M_DEVBUF); if (sc->ti_membuf2) free(sc->ti_membuf2, M_DEVBUF); mtx_destroy(&sc->ti_mtx); return (0); } #ifdef TI_JUMBO_HDRSPLIT /* * If hdr_len is 0, that means that header splitting wasn't done on * this packet for some reason. The two most likely reasons are that * the protocol isn't a supported protocol for splitting, or this * packet had a fragment offset that wasn't 0. * * The header length, if it is non-zero, will always be the length of * the headers on the packet, but that length could be longer than the * first mbuf. So we take the minimum of the two as the actual * length. */ static __inline void ti_hdr_split(struct mbuf *top, int hdr_len, int pkt_len, int idx) { int i = 0; int lengths[4] = {0, 0, 0, 0}; struct mbuf *m, *mp; if (hdr_len != 0) top->m_len = min(hdr_len, top->m_len); pkt_len -= top->m_len; lengths[i++] = top->m_len; mp = top; for (m = top->m_next; m && pkt_len; m = m->m_next) { m->m_len = m->m_ext.ext_size = min(m->m_len, pkt_len); pkt_len -= m->m_len; lengths[i++] = m->m_len; mp = m; } #if 0 if (hdr_len != 0) printf("got split packet: "); else printf("got non-split packet: "); printf("%d,%d,%d,%d = %d\n", lengths[0], lengths[1], lengths[2], lengths[3], lengths[0] + lengths[1] + lengths[2] + lengths[3]); #endif if (pkt_len) panic("header splitting didn't"); if (m) { m_freem(m); mp->m_next = NULL; } if (mp->m_next != NULL) panic("ti_hdr_split: last mbuf in chain should be null"); } #endif /* TI_JUMBO_HDRSPLIT */ static void ti_discard_std(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_std_ring[i]; r->ti_len = MCLBYTES - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = 0; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } static void ti_discard_mini(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_mini_ring[i]; r->ti_len = MHLEN - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = TI_BDFLAG_MINI_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } #ifndef TI_SF_BUF_JUMBO static void ti_discard_jumbo(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_jumbo_ring[i]; r->ti_len = MJUM9BYTES - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } #endif /* * Frame reception handling. This is called if there's a frame * on the receive return list. * * Note: we have to be able to handle three possibilities here: * 1) the frame is from the mini receive ring (can only happen) * on Tigon 2 boards) * 2) the frame is from the jumbo receive ring * 3) the frame is from the standard receive ring */ static void ti_rxeof(struct ti_softc *sc) { struct ifnet *ifp; #ifdef TI_SF_BUF_JUMBO bus_dmamap_t map; #endif struct ti_cmd_desc cmd; int jumbocnt, minicnt, stdcnt, ti_len; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_POSTWRITE); if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_POSTWRITE); if (sc->ti_rdata.ti_rx_mini_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_sync(sc->ti_cdata.ti_rx_return_ring_tag, sc->ti_cdata.ti_rx_return_ring_map, BUS_DMASYNC_POSTREAD); jumbocnt = minicnt = stdcnt = 0; while (sc->ti_rx_saved_considx != sc->ti_return_prodidx.ti_idx) { struct ti_rx_desc *cur_rx; uint32_t rxidx; struct mbuf *m = NULL; uint16_t vlan_tag = 0; int have_tag = 0; cur_rx = &sc->ti_rdata.ti_rx_return_ring[sc->ti_rx_saved_considx]; rxidx = cur_rx->ti_idx; ti_len = cur_rx->ti_len; TI_INC(sc->ti_rx_saved_considx, TI_RETURN_RING_CNT); if (cur_rx->ti_flags & TI_BDFLAG_VLAN_TAG) { have_tag = 1; vlan_tag = cur_rx->ti_vlan_tag; } if (cur_rx->ti_flags & TI_BDFLAG_JUMBO_RING) { jumbocnt++; TI_INC(sc->ti_jumbo, TI_JUMBO_RX_RING_CNT); m = sc->ti_cdata.ti_rx_jumbo_chain[rxidx]; #ifndef TI_SF_BUF_JUMBO if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_jumbo(sc, rxidx); continue; } if (ti_newbuf_jumbo(sc, rxidx, NULL) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_jumbo(sc, rxidx); continue; } m->m_len = ti_len; #else /* !TI_SF_BUF_JUMBO */ sc->ti_cdata.ti_rx_jumbo_chain[rxidx] = NULL; map = sc->ti_cdata.ti_rx_jumbo_maps[rxidx]; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, map); if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_newbuf_jumbo(sc, sc->ti_jumbo, m); continue; } if (ti_newbuf_jumbo(sc, sc->ti_jumbo, NULL) == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_newbuf_jumbo(sc, sc->ti_jumbo, m); continue; } #ifdef TI_JUMBO_HDRSPLIT if (sc->ti_hdrsplit) ti_hdr_split(m, TI_HOSTADDR(cur_rx->ti_addr), ti_len, rxidx); else #endif /* TI_JUMBO_HDRSPLIT */ m_adj(m, ti_len - m->m_pkthdr.len); #endif /* TI_SF_BUF_JUMBO */ } else if (cur_rx->ti_flags & TI_BDFLAG_MINI_RING) { minicnt++; TI_INC(sc->ti_mini, TI_MINI_RX_RING_CNT); m = sc->ti_cdata.ti_rx_mini_chain[rxidx]; if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_mini(sc, rxidx); continue; } if (ti_newbuf_mini(sc, rxidx) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_mini(sc, rxidx); continue; } m->m_len = ti_len; } else { stdcnt++; TI_INC(sc->ti_std, TI_STD_RX_RING_CNT); m = sc->ti_cdata.ti_rx_std_chain[rxidx]; if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_std(sc, rxidx); continue; } if (ti_newbuf_std(sc, rxidx) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_std(sc, rxidx); continue; } m->m_len = ti_len; } m->m_pkthdr.len = ti_len; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; if (ifp->if_capenable & IFCAP_RXCSUM) { if (cur_rx->ti_flags & TI_BDFLAG_IP_CKSUM) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if ((cur_rx->ti_ip_cksum ^ 0xffff) == 0) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; } if (cur_rx->ti_flags & TI_BDFLAG_TCP_UDP_CKSUM) { m->m_pkthdr.csum_data = cur_rx->ti_tcp_udp_cksum; m->m_pkthdr.csum_flags |= CSUM_DATA_VALID; } } /* * If we received a packet with a vlan tag, * tag it before passing the packet upward. */ if (have_tag) { m->m_pkthdr.ether_vtag = vlan_tag; m->m_flags |= M_VLANTAG; } TI_UNLOCK(sc); (*ifp->if_input)(ifp, m); TI_LOCK(sc); } bus_dmamap_sync(sc->ti_cdata.ti_rx_return_ring_tag, sc->ti_cdata.ti_rx_return_ring_map, BUS_DMASYNC_PREREAD); /* Only necessary on the Tigon 1. */ if (sc->ti_hwrev == TI_HWREV_TIGON) CSR_WRITE_4(sc, TI_GCR_RXRETURNCONS_IDX, sc->ti_rx_saved_considx); if (stdcnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_STDPROD(sc, sc->ti_std); } if (minicnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_MINIPROD(sc, sc->ti_mini); } if (jumbocnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_JUMBOPROD(sc, sc->ti_jumbo); } } static void ti_txeof(struct ti_softc *sc) { struct ti_txdesc *txd; struct ti_tx_desc txdesc; struct ti_tx_desc *cur_tx = NULL; struct ifnet *ifp; int idx; ifp = sc->ti_ifp; txd = STAILQ_FIRST(&sc->ti_cdata.ti_txbusyq); if (txd == NULL) return; if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_POSTWRITE); /* * Go through our tx ring and free mbufs for those * frames that have been sent. */ for (idx = sc->ti_tx_saved_considx; idx != sc->ti_tx_considx.ti_idx; TI_INC(idx, TI_TX_RING_CNT)) { if (sc->ti_hwrev == TI_HWREV_TIGON) { ti_mem_read(sc, TI_TX_RING_BASE + idx * sizeof(txdesc), sizeof(txdesc), &txdesc); cur_tx = &txdesc; } else cur_tx = &sc->ti_rdata.ti_tx_ring[idx]; sc->ti_txcnt--; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if ((cur_tx->ti_flags & TI_BDFLAG_END) == 0) continue; bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); m_freem(txd->tx_m); txd->tx_m = NULL; STAILQ_REMOVE_HEAD(&sc->ti_cdata.ti_txbusyq, tx_q); STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txfreeq, txd, tx_q); txd = STAILQ_FIRST(&sc->ti_cdata.ti_txbusyq); } sc->ti_tx_saved_considx = idx; if (sc->ti_txcnt == 0) sc->ti_timer = 0; } static void ti_intr(void *xsc) { struct ti_softc *sc; struct ifnet *ifp; sc = xsc; TI_LOCK(sc); ifp = sc->ti_ifp; /* Make sure this is really our interrupt. */ if (!(CSR_READ_4(sc, TI_MISC_HOST_CTL) & TI_MHC_INTSTATE)) { TI_UNLOCK(sc); return; } /* Ack interrupt and stop others from occurring. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_POSTREAD); /* Check RX return ring producer/consumer */ ti_rxeof(sc); /* Check TX ring producer/consumer */ ti_txeof(sc); bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_PREREAD); } ti_handle_events(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* Re-enable interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ti_start_locked(ifp); } TI_UNLOCK(sc); } static uint64_t ti_get_counter(struct ifnet *ifp, ift_counter cnt) { switch (cnt) { case IFCOUNTER_COLLISIONS: { struct ti_softc *sc; struct ti_stats *s; uint64_t rv; sc = if_getsoftc(ifp); s = &sc->ti_rdata.ti_info->ti_stats; TI_LOCK(sc); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_POSTREAD); rv = s->dot3StatsSingleCollisionFrames + s->dot3StatsMultipleCollisionFrames + s->dot3StatsExcessiveCollisions + s->dot3StatsLateCollisions; bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD); TI_UNLOCK(sc); return (rv); } default: return (if_get_counter_default(ifp, cnt)); } } /* * Encapsulate an mbuf chain in the tx ring by coupling the mbuf data * pointers to descriptors. */ static int ti_encap(struct ti_softc *sc, struct mbuf **m_head) { struct ti_txdesc *txd; struct ti_tx_desc *f; struct ti_tx_desc txdesc; struct mbuf *m; bus_dma_segment_t txsegs[TI_MAXTXSEGS]; uint16_t csum_flags; int error, frag, i, nseg; if ((txd = STAILQ_FIRST(&sc->ti_cdata.ti_txfreeq)) == NULL) return (ENOBUFS); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, *m_head, txsegs, &nseg, 0); if (error == EFBIG) { m = m_defrag(*m_head, M_NOWAIT); if (m == NULL) { m_freem(*m_head); *m_head = NULL; return (ENOMEM); } *m_head = m; error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, *m_head, txsegs, &nseg, 0); if (error) { m_freem(*m_head); *m_head = NULL; return (error); } } else if (error != 0) return (error); if (nseg == 0) { m_freem(*m_head); *m_head = NULL; return (EIO); } if (sc->ti_txcnt + nseg >= TI_TX_RING_CNT) { bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); return (ENOBUFS); } bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_PREWRITE); m = *m_head; csum_flags = 0; if (m->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= TI_BDFLAG_IP_CKSUM; if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)) csum_flags |= TI_BDFLAG_TCP_UDP_CKSUM; frag = sc->ti_tx_saved_prodidx; for (i = 0; i < nseg; i++) { if (sc->ti_hwrev == TI_HWREV_TIGON) { bzero(&txdesc, sizeof(txdesc)); f = &txdesc; } else f = &sc->ti_rdata.ti_tx_ring[frag]; ti_hostaddr64(&f->ti_addr, txsegs[i].ds_addr); f->ti_len = txsegs[i].ds_len; f->ti_flags = csum_flags; if (m->m_flags & M_VLANTAG) { f->ti_flags |= TI_BDFLAG_VLAN_TAG; f->ti_vlan_tag = m->m_pkthdr.ether_vtag; } else { f->ti_vlan_tag = 0; } if (sc->ti_hwrev == TI_HWREV_TIGON) ti_mem_write(sc, TI_TX_RING_BASE + frag * sizeof(txdesc), sizeof(txdesc), &txdesc); TI_INC(frag, TI_TX_RING_CNT); } sc->ti_tx_saved_prodidx = frag; /* set TI_BDFLAG_END on the last descriptor */ frag = (frag + TI_TX_RING_CNT - 1) % TI_TX_RING_CNT; if (sc->ti_hwrev == TI_HWREV_TIGON) { txdesc.ti_flags |= TI_BDFLAG_END; ti_mem_write(sc, TI_TX_RING_BASE + frag * sizeof(txdesc), sizeof(txdesc), &txdesc); } else sc->ti_rdata.ti_tx_ring[frag].ti_flags |= TI_BDFLAG_END; STAILQ_REMOVE_HEAD(&sc->ti_cdata.ti_txfreeq, tx_q); STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txbusyq, txd, tx_q); txd->tx_m = m; sc->ti_txcnt += nseg; return (0); } static void ti_start(struct ifnet *ifp) { struct ti_softc *sc; sc = ifp->if_softc; TI_LOCK(sc); ti_start_locked(ifp); TI_UNLOCK(sc); } /* * Main transmit routine. To avoid having to do mbuf copies, we put pointers * to the mbuf data regions directly in the transmit descriptors. */ static void ti_start_locked(struct ifnet *ifp) { struct ti_softc *sc; struct mbuf *m_head = NULL; int enq = 0; sc = ifp->if_softc; for (; !IFQ_DRV_IS_EMPTY(&ifp->if_snd) && sc->ti_txcnt < (TI_TX_RING_CNT - 16);) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Pack the data into the transmit ring. If we * don't have room, set the OACTIVE flag and wait * for the NIC to drain the ring. */ if (ti_encap(sc, &m_head)) { if (m_head == NULL) break; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } enq++; /* * If there's a BPF listener, bounce a copy of this frame * to him. */ ETHER_BPF_MTAP(ifp, m_head); } if (enq > 0) { if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); /* Transmit */ CSR_WRITE_4(sc, TI_MB_SENDPROD_IDX, sc->ti_tx_saved_prodidx); /* * Set a timeout in case the chip goes out to lunch. */ sc->ti_timer = 5; } } static void ti_init(void *xsc) { struct ti_softc *sc; sc = xsc; TI_LOCK(sc); ti_init_locked(sc); TI_UNLOCK(sc); } static void ti_init_locked(void *xsc) { struct ti_softc *sc = xsc; if (sc->ti_ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Cancel pending I/O and flush buffers. */ ti_stop(sc); /* Init the gen info block, ring control blocks and firmware. */ if (ti_gibinit(sc)) { device_printf(sc->ti_dev, "initialization failure\n"); return; } } static void ti_init2(struct ti_softc *sc) { struct ti_cmd_desc cmd; struct ifnet *ifp; uint8_t *ea; struct ifmedia *ifm; int tmp; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Specify MTU and interface index. */ CSR_WRITE_4(sc, TI_GCR_IFINDEX, device_get_unit(sc->ti_dev)); CSR_WRITE_4(sc, TI_GCR_IFMTU, ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN); TI_DO_CMD(TI_CMD_UPDATE_GENCOM, 0, 0); /* Load our MAC address. */ ea = IF_LLADDR(sc->ti_ifp); CSR_WRITE_4(sc, TI_GCR_PAR0, (ea[0] << 8) | ea[1]); CSR_WRITE_4(sc, TI_GCR_PAR1, (ea[2] << 24) | (ea[3] << 16) | (ea[4] << 8) | ea[5]); TI_DO_CMD(TI_CMD_SET_MAC_ADDR, 0, 0); /* Enable or disable promiscuous mode as needed. */ if (ifp->if_flags & IFF_PROMISC) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_ENB, 0); } else { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_DIS, 0); } /* Program multicast filter. */ ti_setmulti(sc); /* * If this is a Tigon 1, we should tell the * firmware to use software packet filtering. */ if (sc->ti_hwrev == TI_HWREV_TIGON) { TI_DO_CMD(TI_CMD_FDR_FILTERING, TI_CMD_CODE_FILT_ENB, 0); } /* Init RX ring. */ if (ti_init_rx_ring_std(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for std Rx buffers.\n"); return; } /* Init jumbo RX ring. */ if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) { if (ti_init_rx_ring_jumbo(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for jumbo Rx buffers.\n"); return; } } /* * If this is a Tigon 2, we can also configure the * mini ring. */ if (sc->ti_hwrev == TI_HWREV_TIGON_II) { if (ti_init_rx_ring_mini(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for mini Rx buffers.\n"); return; } } CSR_WRITE_4(sc, TI_GCR_RXRETURNCONS_IDX, 0); sc->ti_rx_saved_considx = 0; /* Init TX ring. */ ti_init_tx_ring(sc); /* Tell firmware we're alive. */ TI_DO_CMD(TI_CMD_HOST_STATE, TI_CMD_CODE_STACK_UP, 0); /* Enable host interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&sc->ti_watchdog, hz, ti_watchdog, sc); /* * Make sure to set media properly. We have to do this * here since we have to issue commands in order to set * the link negotiation and we can't issue commands until * the firmware is running. */ ifm = &sc->ifmedia; tmp = ifm->ifm_media; ifm->ifm_media = ifm->ifm_cur->ifm_media; ti_ifmedia_upd_locked(sc); ifm->ifm_media = tmp; } /* * Set media options. */ static int ti_ifmedia_upd(struct ifnet *ifp) { struct ti_softc *sc; int error; sc = ifp->if_softc; TI_LOCK(sc); error = ti_ifmedia_upd_locked(sc); TI_UNLOCK(sc); return (error); } static int ti_ifmedia_upd_locked(struct ti_softc *sc) { struct ifmedia *ifm; struct ti_cmd_desc cmd; uint32_t flowctl; ifm = &sc->ifmedia; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); flowctl = 0; switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: /* * Transmit flow control doesn't work on the Tigon 1. */ flowctl = TI_GLNK_RX_FLOWCTL_Y; /* * Transmit flow control can also cause problems on the * Tigon 2, apparently with both the copper and fiber * boards. The symptom is that the interface will just * hang. This was reproduced with Alteon 180 switches. */ #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_GLNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, TI_GLNK_PREF|TI_GLNK_1000MB| TI_GLNK_FULL_DUPLEX| flowctl | TI_GLNK_AUTONEGENB|TI_GLNK_ENB); flowctl = TI_LNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_LNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_LINK, TI_LNK_100MB|TI_LNK_10MB| TI_LNK_FULL_DUPLEX|TI_LNK_HALF_DUPLEX| flowctl | TI_LNK_AUTONEGENB|TI_LNK_ENB); TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_BOTH, 0); break; case IFM_1000_SX: case IFM_1000_T: flowctl = TI_GLNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_GLNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, TI_GLNK_PREF|TI_GLNK_1000MB| flowctl |TI_GLNK_ENB); CSR_WRITE_4(sc, TI_GCR_LINK, 0); if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) { TI_SETBIT(sc, TI_GCR_GLINK, TI_GLNK_FULL_DUPLEX); } TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_GIGABIT, 0); break; case IFM_100_FX: case IFM_10_FL: case IFM_100_TX: case IFM_10_T: flowctl = TI_LNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_LNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, 0); CSR_WRITE_4(sc, TI_GCR_LINK, TI_LNK_ENB|TI_LNK_PREF|flowctl); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_100_FX || IFM_SUBTYPE(ifm->ifm_media) == IFM_100_TX) { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_100MB); } else { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_10MB); } if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_FULL_DUPLEX); } else { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_HALF_DUPLEX); } TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_10_100, 0); break; } return (0); } /* * Report current media status. */ static void ti_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct ti_softc *sc; uint32_t media = 0; sc = ifp->if_softc; TI_LOCK(sc); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (sc->ti_linkstat == TI_EV_CODE_LINK_DOWN) { TI_UNLOCK(sc); return; } ifmr->ifm_status |= IFM_ACTIVE; if (sc->ti_linkstat == TI_EV_CODE_GIG_LINK_UP) { media = CSR_READ_4(sc, TI_GCR_GLINK_STAT); if (sc->ti_copper) ifmr->ifm_active |= IFM_1000_T; else ifmr->ifm_active |= IFM_1000_SX; if (media & TI_GLNK_FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; } else if (sc->ti_linkstat == TI_EV_CODE_LINK_UP) { media = CSR_READ_4(sc, TI_GCR_LINK_STAT); if (sc->ti_copper) { if (media & TI_LNK_100MB) ifmr->ifm_active |= IFM_100_TX; if (media & TI_LNK_10MB) ifmr->ifm_active |= IFM_10_T; } else { if (media & TI_LNK_100MB) ifmr->ifm_active |= IFM_100_FX; if (media & TI_LNK_10MB) ifmr->ifm_active |= IFM_10_FL; } if (media & TI_LNK_FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; if (media & TI_LNK_HALF_DUPLEX) ifmr->ifm_active |= IFM_HDX; } TI_UNLOCK(sc); } static int ti_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ti_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; struct ti_cmd_desc cmd; int mask, error = 0; switch (command) { case SIOCSIFMTU: TI_LOCK(sc); if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > TI_JUMBO_MTU) error = EINVAL; else { ifp->if_mtu = ifr->ifr_mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); } } TI_UNLOCK(sc); break; case SIOCSIFFLAGS: TI_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->ti_if_flags & IFF_PROMISC)) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_ENB, 0); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->ti_if_flags & IFF_PROMISC) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_DIS, 0); } else ti_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ti_stop(sc); } } sc->ti_if_flags = ifp->if_flags; TI_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: TI_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) ti_setmulti(sc); TI_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->ifmedia, command); break; case SIOCSIFCAP: TI_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if ((mask & IFCAP_TXCSUM) != 0 && (ifp->if_capabilities & IFCAP_TXCSUM) != 0) { ifp->if_capenable ^= IFCAP_TXCSUM; if ((ifp->if_capenable & IFCAP_TXCSUM) != 0) ifp->if_hwassist |= TI_CSUM_FEATURES; else ifp->if_hwassist &= ~TI_CSUM_FEATURES; } if ((mask & IFCAP_RXCSUM) != 0 && (ifp->if_capabilities & IFCAP_RXCSUM) != 0) ifp->if_capenable ^= IFCAP_RXCSUM; if ((mask & IFCAP_VLAN_HWTAGGING) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWTAGGING) != 0) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if ((mask & IFCAP_VLAN_HWCSUM) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWCSUM) != 0) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; if ((mask & (IFCAP_TXCSUM | IFCAP_RXCSUM | IFCAP_VLAN_HWTAGGING)) != 0) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); } } TI_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int ti_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct ti_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); TI_LOCK(sc); sc->ti_flags |= TI_FLAG_DEBUGING; TI_UNLOCK(sc); return (0); } static int ti_close(struct cdev *dev, int flag, int fmt, struct thread *td) { struct ti_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); TI_LOCK(sc); sc->ti_flags &= ~TI_FLAG_DEBUGING; TI_UNLOCK(sc); return (0); } /* * This ioctl routine goes along with the Tigon character device. */ static int ti_ioctl2(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ti_softc *sc; int error; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); error = 0; switch (cmd) { case TIIOCGETSTATS: { struct ti_stats *outstats; outstats = (struct ti_stats *)addr; TI_LOCK(sc); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_POSTREAD); bcopy(&sc->ti_rdata.ti_info->ti_stats, outstats, sizeof(struct ti_stats)); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD); TI_UNLOCK(sc); break; } case TIIOCGETPARAMS: { struct ti_params *params; params = (struct ti_params *)addr; TI_LOCK(sc); params->ti_stat_ticks = sc->ti_stat_ticks; params->ti_rx_coal_ticks = sc->ti_rx_coal_ticks; params->ti_tx_coal_ticks = sc->ti_tx_coal_ticks; params->ti_rx_max_coal_bds = sc->ti_rx_max_coal_bds; params->ti_tx_max_coal_bds = sc->ti_tx_max_coal_bds; params->ti_tx_buf_ratio = sc->ti_tx_buf_ratio; params->param_mask = TI_PARAM_ALL; TI_UNLOCK(sc); break; } case TIIOCSETPARAMS: { struct ti_params *params; params = (struct ti_params *)addr; TI_LOCK(sc); if (params->param_mask & TI_PARAM_STAT_TICKS) { sc->ti_stat_ticks = params->ti_stat_ticks; CSR_WRITE_4(sc, TI_GCR_STAT_TICKS, sc->ti_stat_ticks); } if (params->param_mask & TI_PARAM_RX_COAL_TICKS) { sc->ti_rx_coal_ticks = params->ti_rx_coal_ticks; CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, sc->ti_rx_coal_ticks); } if (params->param_mask & TI_PARAM_TX_COAL_TICKS) { sc->ti_tx_coal_ticks = params->ti_tx_coal_ticks; CSR_WRITE_4(sc, TI_GCR_TX_COAL_TICKS, sc->ti_tx_coal_ticks); } if (params->param_mask & TI_PARAM_RX_COAL_BDS) { sc->ti_rx_max_coal_bds = params->ti_rx_max_coal_bds; CSR_WRITE_4(sc, TI_GCR_RX_MAX_COAL_BD, sc->ti_rx_max_coal_bds); } if (params->param_mask & TI_PARAM_TX_COAL_BDS) { sc->ti_tx_max_coal_bds = params->ti_tx_max_coal_bds; CSR_WRITE_4(sc, TI_GCR_TX_MAX_COAL_BD, sc->ti_tx_max_coal_bds); } if (params->param_mask & TI_PARAM_TX_BUF_RATIO) { sc->ti_tx_buf_ratio = params->ti_tx_buf_ratio; CSR_WRITE_4(sc, TI_GCR_TX_BUFFER_RATIO, sc->ti_tx_buf_ratio); } TI_UNLOCK(sc); break; } case TIIOCSETTRACE: { ti_trace_type trace_type; trace_type = *(ti_trace_type *)addr; /* * Set tracing to whatever the user asked for. Setting * this register to 0 should have the effect of disabling * tracing. */ TI_LOCK(sc); CSR_WRITE_4(sc, TI_GCR_NIC_TRACING, trace_type); TI_UNLOCK(sc); break; } case TIIOCGETTRACE: { struct ti_trace_buf *trace_buf; uint32_t trace_start, cur_trace_ptr, trace_len; trace_buf = (struct ti_trace_buf *)addr; TI_LOCK(sc); trace_start = CSR_READ_4(sc, TI_GCR_NICTRACE_START); cur_trace_ptr = CSR_READ_4(sc, TI_GCR_NICTRACE_PTR); trace_len = CSR_READ_4(sc, TI_GCR_NICTRACE_LEN); #if 0 if_printf(sc->ti_ifp, "trace_start = %#x, cur_trace_ptr = %#x, " "trace_len = %d\n", trace_start, cur_trace_ptr, trace_len); if_printf(sc->ti_ifp, "trace_buf->buf_len = %d\n", trace_buf->buf_len); #endif error = ti_copy_mem(sc, trace_start, min(trace_len, trace_buf->buf_len), (caddr_t)trace_buf->buf, 1, 1); if (error == 0) { trace_buf->fill_len = min(trace_len, trace_buf->buf_len); if (cur_trace_ptr < trace_start) trace_buf->cur_trace_ptr = trace_start - cur_trace_ptr; else trace_buf->cur_trace_ptr = cur_trace_ptr - trace_start; } else trace_buf->fill_len = 0; TI_UNLOCK(sc); break; } /* * For debugging, five ioctls are needed: * ALT_ATTACH * ALT_READ_TG_REG * ALT_WRITE_TG_REG * ALT_READ_TG_MEM * ALT_WRITE_TG_MEM */ case ALT_ATTACH: /* * From what I can tell, Alteon's Solaris Tigon driver * only has one character device, so you have to attach * to the Tigon board you're interested in. This seems * like a not-so-good way to do things, since unless you * subsequently specify the unit number of the device * you're interested in every ioctl, you'll only be * able to debug one board at a time. */ break; case ALT_READ_TG_MEM: case ALT_WRITE_TG_MEM: { struct tg_mem *mem_param; uint32_t sram_end, scratch_end; mem_param = (struct tg_mem *)addr; if (sc->ti_hwrev == TI_HWREV_TIGON) { sram_end = TI_END_SRAM_I; scratch_end = TI_END_SCRATCH_I; } else { sram_end = TI_END_SRAM_II; scratch_end = TI_END_SCRATCH_II; } /* * For now, we'll only handle accessing regular SRAM, * nothing else. */ TI_LOCK(sc); if (mem_param->tgAddr >= TI_BEG_SRAM && mem_param->tgAddr + mem_param->len <= sram_end) { /* * In this instance, we always copy to/from user * space, so the user space argument is set to 1. */ error = ti_copy_mem(sc, mem_param->tgAddr, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0); } else if (mem_param->tgAddr >= TI_BEG_SCRATCH && mem_param->tgAddr <= scratch_end) { error = ti_copy_scratch(sc, mem_param->tgAddr, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0, TI_PROCESSOR_A); } else if (mem_param->tgAddr >= TI_BEG_SCRATCH_B_DEBUG && mem_param->tgAddr <= TI_BEG_SCRATCH_B_DEBUG) { if (sc->ti_hwrev == TI_HWREV_TIGON) { if_printf(sc->ti_ifp, "invalid memory range for Tigon I\n"); error = EINVAL; break; } error = ti_copy_scratch(sc, mem_param->tgAddr - TI_SCRATCH_DEBUG_OFF, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0, TI_PROCESSOR_B); } else { if_printf(sc->ti_ifp, "memory address %#x len %d is " "out of supported range\n", mem_param->tgAddr, mem_param->len); error = EINVAL; } TI_UNLOCK(sc); break; } case ALT_READ_TG_REG: case ALT_WRITE_TG_REG: { struct tg_reg *regs; uint32_t tmpval; regs = (struct tg_reg *)addr; /* * Make sure the address in question isn't out of range. */ if (regs->addr > TI_REG_MAX) { error = EINVAL; break; } TI_LOCK(sc); if (cmd == ALT_READ_TG_REG) { bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, regs->addr, &tmpval, 1); regs->data = ntohl(tmpval); #if 0 if ((regs->addr == TI_CPU_STATE) || (regs->addr == TI_CPU_CTL_B)) { if_printf(sc->ti_ifp, "register %#x = %#x\n", regs->addr, tmpval); } #endif } else { tmpval = htonl(regs->data); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, regs->addr, &tmpval, 1); } TI_UNLOCK(sc); break; } default: error = ENOTTY; break; } return (error); } static void ti_watchdog(void *arg) { struct ti_softc *sc; struct ifnet *ifp; sc = arg; TI_LOCK_ASSERT(sc); callout_reset(&sc->ti_watchdog, hz, ti_watchdog, sc); if (sc->ti_timer == 0 || --sc->ti_timer > 0) return; /* * When we're debugging, the chip is often stopped for long periods * of time, and that would normally cause the watchdog timer to fire. * Since that impedes debugging, we don't want to do that. */ if (sc->ti_flags & TI_FLAG_DEBUGING) return; ifp = sc->ti_ifp; if_printf(ifp, "watchdog timeout -- resetting\n"); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Stop the adapter and free any mbufs allocated to the * RX and TX lists. */ static void ti_stop(struct ti_softc *sc) { struct ifnet *ifp; struct ti_cmd_desc cmd; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Disable host interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* * Tell firmware we're shutting down. */ TI_DO_CMD(TI_CMD_HOST_STATE, TI_CMD_CODE_STACK_DOWN, 0); /* Halt and reinitialize. */ if (ti_chipinit(sc) == 0) { ti_mem_zero(sc, 0x2000, 0x100000 - 0x2000); /* XXX ignore init errors. */ ti_chipinit(sc); } /* Free the RX lists. */ ti_free_rx_ring_std(sc); /* Free jumbo RX list. */ ti_free_rx_ring_jumbo(sc); /* Free mini RX list. */ ti_free_rx_ring_mini(sc); /* Free TX buffers. */ ti_free_tx_ring(sc); sc->ti_ev_prodidx.ti_idx = 0; sc->ti_return_prodidx.ti_idx = 0; sc->ti_tx_considx.ti_idx = 0; sc->ti_tx_saved_considx = TI_TXCONS_UNSET; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); callout_stop(&sc->ti_watchdog); } /* * Stop all chip I/O so that the kernel's probe routines don't * get confused by errant DMAs when rebooting. */ static int ti_shutdown(device_t dev) { struct ti_softc *sc; sc = device_get_softc(dev); TI_LOCK(sc); ti_chipinit(sc); TI_UNLOCK(sc); return (0); } static void ti_sysctl_node(struct ti_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *child; char tname[32]; ctx = device_get_sysctl_ctx(sc->ti_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->ti_dev)); /* Use DAC */ sc->ti_dac = 1; snprintf(tname, sizeof(tname), "dev.ti.%d.dac", device_get_unit(sc->ti_dev)); TUNABLE_INT_FETCH(tname, &sc->ti_dac); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rx_coal_ticks", CTLFLAG_RW, &sc->ti_rx_coal_ticks, 0, "Receive coalcesced ticks"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rx_max_coal_bds", CTLFLAG_RW, &sc->ti_rx_max_coal_bds, 0, "Receive max coalcesced BDs"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_coal_ticks", CTLFLAG_RW, &sc->ti_tx_coal_ticks, 0, "Send coalcesced ticks"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_max_coal_bds", CTLFLAG_RW, &sc->ti_tx_max_coal_bds, 0, "Send max coalcesced BDs"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_buf_ratio", CTLFLAG_RW, &sc->ti_tx_buf_ratio, 0, "Ratio of NIC memory devoted to TX buffer"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "stat_ticks", CTLFLAG_RW, &sc->ti_stat_ticks, 0, "Number of clock ticks for statistics update interval"); /* Pull in device tunables. */ sc->ti_rx_coal_ticks = 170; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "rx_coal_ticks", &sc->ti_rx_coal_ticks); sc->ti_rx_max_coal_bds = 64; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "rx_max_coal_bds", &sc->ti_rx_max_coal_bds); sc->ti_tx_coal_ticks = TI_TICKS_PER_SEC / 500; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_coal_ticks", &sc->ti_tx_coal_ticks); sc->ti_tx_max_coal_bds = 32; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_max_coal_bds", &sc->ti_tx_max_coal_bds); sc->ti_tx_buf_ratio = 21; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_buf_ratio", &sc->ti_tx_buf_ratio); sc->ti_stat_ticks = 2 * TI_TICKS_PER_SEC; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "stat_ticks", &sc->ti_stat_ticks); } diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c index 3e2d967dd9af..bf4e5cf916f7 100644 --- a/sys/dev/virtio/balloon/virtio_balloon.c +++ b/sys/dev/virtio/balloon/virtio_balloon.c @@ -1,594 +1,593 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO memory balloon devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_if.h" struct vtballoon_softc { device_t vtballoon_dev; struct mtx vtballoon_mtx; uint64_t vtballoon_features; uint32_t vtballoon_flags; #define VTBALLOON_FLAG_DETACH 0x01 struct virtqueue *vtballoon_inflate_vq; struct virtqueue *vtballoon_deflate_vq; uint32_t vtballoon_desired_npages; uint32_t vtballoon_current_npages; TAILQ_HEAD(,vm_page) vtballoon_pages; struct thread *vtballoon_td; uint32_t *vtballoon_page_frames; int vtballoon_timeout; }; static struct virtio_feature_desc vtballoon_feature_desc[] = { { VIRTIO_BALLOON_F_MUST_TELL_HOST, "MustTellHost" }, { VIRTIO_BALLOON_F_STATS_VQ, "StatsVq" }, { VIRTIO_BALLOON_F_DEFLATE_ON_OOM, "DeflateOnOOM" }, { 0, NULL } }; static int vtballoon_probe(device_t); static int vtballoon_attach(device_t); static int vtballoon_detach(device_t); static int vtballoon_config_change(device_t); static int vtballoon_negotiate_features(struct vtballoon_softc *); static int vtballoon_setup_features(struct vtballoon_softc *); static int vtballoon_alloc_virtqueues(struct vtballoon_softc *); static void vtballoon_vq_intr(void *); static void vtballoon_inflate(struct vtballoon_softc *, int); static void vtballoon_deflate(struct vtballoon_softc *, int); static void vtballoon_send_page_frames(struct vtballoon_softc *, struct virtqueue *, int); static void vtballoon_pop(struct vtballoon_softc *); static void vtballoon_stop(struct vtballoon_softc *); static vm_page_t vtballoon_alloc_page(struct vtballoon_softc *); static void vtballoon_free_page(struct vtballoon_softc *, vm_page_t); static int vtballoon_sleep(struct vtballoon_softc *); static void vtballoon_thread(void *); static void vtballoon_setup_sysctl(struct vtballoon_softc *); #define vtballoon_modern(_sc) \ (((_sc)->vtballoon_features & VIRTIO_F_VERSION_1) != 0) /* Features desired/implemented by this driver. */ #define VTBALLOON_FEATURES VIRTIO_BALLOON_F_MUST_TELL_HOST /* Timeout between retries when the balloon needs inflating. */ #define VTBALLOON_LOWMEM_TIMEOUT hz /* * Maximum number of pages we'll request to inflate or deflate * the balloon in one virtqueue request. Both Linux and NetBSD * have settled on 256, doing up to 1MB at a time. */ #define VTBALLOON_PAGES_PER_REQUEST 256 /* Must be able to fix all pages frames in one page (segment). */ CTASSERT(VTBALLOON_PAGES_PER_REQUEST * sizeof(uint32_t) <= PAGE_SIZE); #define VTBALLOON_MTX(_sc) &(_sc)->vtballoon_mtx #define VTBALLOON_LOCK_INIT(_sc, _name) mtx_init(VTBALLOON_MTX((_sc)), _name, \ "VirtIO Balloon Lock", MTX_DEF) #define VTBALLOON_LOCK(_sc) mtx_lock(VTBALLOON_MTX((_sc))) #define VTBALLOON_UNLOCK(_sc) mtx_unlock(VTBALLOON_MTX((_sc))) #define VTBALLOON_LOCK_DESTROY(_sc) mtx_destroy(VTBALLOON_MTX((_sc))) static device_method_t vtballoon_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtballoon_probe), DEVMETHOD(device_attach, vtballoon_attach), DEVMETHOD(device_detach, vtballoon_detach), /* VirtIO methods. */ DEVMETHOD(virtio_config_change, vtballoon_config_change), DEVMETHOD_END }; static driver_t vtballoon_driver = { "vtballoon", vtballoon_methods, sizeof(struct vtballoon_softc) }; static devclass_t vtballoon_devclass; VIRTIO_DRIVER_MODULE(virtio_balloon, vtballoon_driver, vtballoon_devclass, 0, 0); MODULE_VERSION(virtio_balloon, 1); MODULE_DEPEND(virtio_balloon, virtio, 1, 1, 1); VIRTIO_SIMPLE_PNPINFO(virtio_balloon, VIRTIO_ID_BALLOON, "VirtIO Balloon Adapter"); static int vtballoon_probe(device_t dev) { return (VIRTIO_SIMPLE_PROBE(dev, virtio_balloon)); } static int vtballoon_attach(device_t dev) { struct vtballoon_softc *sc; int error; sc = device_get_softc(dev); sc->vtballoon_dev = dev; virtio_set_feature_desc(dev, vtballoon_feature_desc); VTBALLOON_LOCK_INIT(sc, device_get_nameunit(dev)); TAILQ_INIT(&sc->vtballoon_pages); vtballoon_setup_sysctl(sc); error = vtballoon_setup_features(sc); if (error) { device_printf(dev, "cannot setup features\n"); goto fail; } sc->vtballoon_page_frames = malloc(VTBALLOON_PAGES_PER_REQUEST * sizeof(uint32_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtballoon_page_frames == NULL) { error = ENOMEM; device_printf(dev, "cannot allocate page frame request array\n"); goto fail; } error = vtballoon_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_MISC); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); goto fail; } error = kthread_add(vtballoon_thread, sc, NULL, &sc->vtballoon_td, 0, 0, "virtio_balloon"); if (error) { device_printf(dev, "cannot create balloon kthread\n"); goto fail; } virtqueue_enable_intr(sc->vtballoon_inflate_vq); virtqueue_enable_intr(sc->vtballoon_deflate_vq); fail: if (error) vtballoon_detach(dev); return (error); } static int vtballoon_detach(device_t dev) { struct vtballoon_softc *sc; sc = device_get_softc(dev); if (sc->vtballoon_td != NULL) { VTBALLOON_LOCK(sc); sc->vtballoon_flags |= VTBALLOON_FLAG_DETACH; wakeup_one(sc); msleep(sc->vtballoon_td, VTBALLOON_MTX(sc), 0, "vtbdth", 0); VTBALLOON_UNLOCK(sc); sc->vtballoon_td = NULL; } if (device_is_attached(dev)) { vtballoon_pop(sc); vtballoon_stop(sc); } if (sc->vtballoon_page_frames != NULL) { free(sc->vtballoon_page_frames, M_DEVBUF); sc->vtballoon_page_frames = NULL; } VTBALLOON_LOCK_DESTROY(sc); return (0); } static int vtballoon_config_change(device_t dev) { struct vtballoon_softc *sc; sc = device_get_softc(dev); VTBALLOON_LOCK(sc); wakeup_one(sc); VTBALLOON_UNLOCK(sc); return (1); } static int vtballoon_negotiate_features(struct vtballoon_softc *sc) { device_t dev; uint64_t features; dev = sc->vtballoon_dev; features = VTBALLOON_FEATURES; sc->vtballoon_features = virtio_negotiate_features(dev, features); return (virtio_finalize_features(dev)); } static int vtballoon_setup_features(struct vtballoon_softc *sc) { int error; error = vtballoon_negotiate_features(sc); if (error) return (error); return (0); } static int vtballoon_alloc_virtqueues(struct vtballoon_softc *sc) { device_t dev; struct vq_alloc_info vq_info[2]; int nvqs; dev = sc->vtballoon_dev; nvqs = 2; VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtballoon_vq_intr, sc, &sc->vtballoon_inflate_vq, "%s inflate", device_get_nameunit(dev)); VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtballoon_vq_intr, sc, &sc->vtballoon_deflate_vq, "%s deflate", device_get_nameunit(dev)); return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info)); } static void vtballoon_vq_intr(void *xsc) { struct vtballoon_softc *sc; sc = xsc; VTBALLOON_LOCK(sc); wakeup_one(sc); VTBALLOON_UNLOCK(sc); } static void vtballoon_inflate(struct vtballoon_softc *sc, int npages) { struct virtqueue *vq; vm_page_t m; int i; vq = sc->vtballoon_inflate_vq; if (npages > VTBALLOON_PAGES_PER_REQUEST) npages = VTBALLOON_PAGES_PER_REQUEST; for (i = 0; i < npages; i++) { if ((m = vtballoon_alloc_page(sc)) == NULL) { sc->vtballoon_timeout = VTBALLOON_LOWMEM_TIMEOUT; break; } sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; KASSERT(m->a.queue == PQ_NONE, ("%s: allocated page %p on queue", __func__, m)); TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, plinks.q); } if (i > 0) vtballoon_send_page_frames(sc, vq, i); } static void vtballoon_deflate(struct vtballoon_softc *sc, int npages) { TAILQ_HEAD(, vm_page) free_pages; struct virtqueue *vq; vm_page_t m; int i; vq = sc->vtballoon_deflate_vq; TAILQ_INIT(&free_pages); if (npages > VTBALLOON_PAGES_PER_REQUEST) npages = VTBALLOON_PAGES_PER_REQUEST; for (i = 0; i < npages; i++) { m = TAILQ_FIRST(&sc->vtballoon_pages); KASSERT(m != NULL, ("%s: no more pages to deflate", __func__)); sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; TAILQ_REMOVE(&sc->vtballoon_pages, m, plinks.q); TAILQ_INSERT_TAIL(&free_pages, m, plinks.q); } if (i > 0) { /* Always tell host first before freeing the pages. */ vtballoon_send_page_frames(sc, vq, i); while ((m = TAILQ_FIRST(&free_pages)) != NULL) { TAILQ_REMOVE(&free_pages, m, plinks.q); vtballoon_free_page(sc, m); } } KASSERT((TAILQ_EMPTY(&sc->vtballoon_pages) && sc->vtballoon_current_npages == 0) || (!TAILQ_EMPTY(&sc->vtballoon_pages) && sc->vtballoon_current_npages != 0), ("%s: bogus page count %d", __func__, sc->vtballoon_current_npages)); } static void vtballoon_send_page_frames(struct vtballoon_softc *sc, struct virtqueue *vq, int npages) { struct sglist sg; struct sglist_seg segs[1]; void *c; int error; sglist_init(&sg, 1, segs); error = sglist_append(&sg, sc->vtballoon_page_frames, npages * sizeof(uint32_t)); KASSERT(error == 0, ("error adding page frames to sglist")); error = virtqueue_enqueue(vq, vq, &sg, 1, 0); KASSERT(error == 0, ("error enqueuing page frames to virtqueue")); virtqueue_notify(vq); /* * Inflate and deflate operations are done synchronously. The * interrupt handler will wake us up. */ VTBALLOON_LOCK(sc); while ((c = virtqueue_dequeue(vq, NULL)) == NULL) msleep(sc, VTBALLOON_MTX(sc), 0, "vtbspf", 0); VTBALLOON_UNLOCK(sc); KASSERT(c == vq, ("unexpected balloon operation response")); } static void vtballoon_pop(struct vtballoon_softc *sc) { while (!TAILQ_EMPTY(&sc->vtballoon_pages)) vtballoon_deflate(sc, sc->vtballoon_current_npages); } static void vtballoon_stop(struct vtballoon_softc *sc) { virtqueue_disable_intr(sc->vtballoon_inflate_vq); virtqueue_disable_intr(sc->vtballoon_deflate_vq); virtio_stop(sc->vtballoon_dev); } static vm_page_t vtballoon_alloc_page(struct vtballoon_softc *sc) { vm_page_t m; - m = vm_page_alloc(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP); + m = vm_page_alloc_noobj(VM_ALLOC_NODUMP); if (m != NULL) sc->vtballoon_current_npages++; return (m); } static void vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m) { vm_page_free(m); sc->vtballoon_current_npages--; } static uint32_t vtballoon_desired_size(struct vtballoon_softc *sc) { uint32_t desired; desired = virtio_read_dev_config_4(sc->vtballoon_dev, offsetof(struct virtio_balloon_config, num_pages)); if (vtballoon_modern(sc)) return (desired); else return (le32toh(desired)); } static void vtballoon_update_size(struct vtballoon_softc *sc) { uint32_t npages; npages = sc->vtballoon_current_npages; if (!vtballoon_modern(sc)) npages = htole32(npages); virtio_write_dev_config_4(sc->vtballoon_dev, offsetof(struct virtio_balloon_config, actual), npages); } static int vtballoon_sleep(struct vtballoon_softc *sc) { int rc, timeout; uint32_t current, desired; rc = 0; current = sc->vtballoon_current_npages; VTBALLOON_LOCK(sc); for (;;) { if (sc->vtballoon_flags & VTBALLOON_FLAG_DETACH) { rc = 1; break; } desired = vtballoon_desired_size(sc); sc->vtballoon_desired_npages = desired; /* * If given, use non-zero timeout on the first time through * the loop. On subsequent times, timeout will be zero so * we will reevaluate the desired size of the balloon and * break out to retry if needed. */ timeout = sc->vtballoon_timeout; sc->vtballoon_timeout = 0; if (current > desired) break; if (current < desired && timeout == 0) break; msleep(sc, VTBALLOON_MTX(sc), 0, "vtbslp", timeout); } VTBALLOON_UNLOCK(sc); return (rc); } static void vtballoon_thread(void *xsc) { struct vtballoon_softc *sc; uint32_t current, desired; sc = xsc; for (;;) { if (vtballoon_sleep(sc) != 0) break; current = sc->vtballoon_current_npages; desired = sc->vtballoon_desired_npages; if (desired != current) { if (desired > current) vtballoon_inflate(sc, desired - current); else vtballoon_deflate(sc, current - desired); vtballoon_update_size(sc); } } kthread_exit(); } static void vtballoon_setup_sysctl(struct vtballoon_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtballoon_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "desired", CTLFLAG_RD, &sc->vtballoon_desired_npages, sizeof(uint32_t), "Desired balloon size in pages"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "current", CTLFLAG_RD, &sc->vtballoon_current_npages, sizeof(uint32_t), "Current balloon size in pages"); } diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c index 01d75204f5bf..007fada24259 100644 --- a/sys/dev/xen/balloon/balloon.c +++ b/sys/dev/xen/balloon/balloon.c @@ -1,421 +1,414 @@ /****************************************************************************** * balloon.c * * Xen balloon driver - enables returning/claiming memory to/from Xen. * * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver"); /* Convert from KB (as fetched from xenstore) to number of PAGES */ #define KB_TO_PAGE_SHIFT (PAGE_SHIFT - 10) struct mtx balloon_mutex; /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; /* We may hit the hard limit in Xen. If we do then we remember it. */ unsigned long hard_limit; /* * Drivers may alter the memory reservation independently, but they * must inform the balloon driver so we avoid hitting the hard limit. */ unsigned long driver_pages; /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; }; static struct balloon_stats balloon_stats; #define bs balloon_stats SYSCTL_DECL(_dev_xen); static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Balloon"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD, &bs.current_pages, 0, "Current allocation"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD, &bs.target_pages, 0, "Target allocation"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD, &bs.driver_pages, 0, "Driver pages"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD, &bs.hard_limit, 0, "Xen hard limit"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD, &bs.balloon_low, 0, "Low-mem balloon"); SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD, &bs.balloon_high, 0, "High-mem balloon"); /* List of ballooned pages, threaded through the mem_map array. */ static TAILQ_HEAD(,vm_page) ballooned_pages; /* Main work function, always executed in process context. */ static void balloon_process(void *unused); #define IPRINTK(fmt, args...) \ printk(KERN_INFO "xen_mem: " fmt, ##args) #define WPRINTK(fmt, args...) \ printk(KERN_WARNING "xen_mem: " fmt, ##args) static unsigned long current_target(void) { unsigned long target = min(bs.target_pages, bs.hard_limit); if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) target = bs.current_pages + bs.balloon_low + bs.balloon_high; return (target); } static unsigned long minimum_target(void) { unsigned long min_pages, curr_pages = current_target(); #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* * Simple continuous piecewiese linear function: * max MiB -> min MiB gradient * 0 0 * 16 16 * 32 24 * 128 72 (1/2) * 512 168 (1/4) * 2048 360 (1/8) * 8192 552 (1/32) * 32768 1320 * 131072 4392 */ if (realmem < MB2PAGES(128)) min_pages = MB2PAGES(8) + (realmem >> 1); else if (realmem < MB2PAGES(512)) min_pages = MB2PAGES(40) + (realmem >> 2); else if (realmem < MB2PAGES(2048)) min_pages = MB2PAGES(104) + (realmem >> 3); else min_pages = MB2PAGES(296) + (realmem >> 5); #undef MB2PAGES /* Don't enforce growth */ return (min(min_pages, curr_pages)); } static int increase_reservation(unsigned long nr_pages) { unsigned long i; vm_page_t page; long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; mtx_assert(&balloon_mutex, MA_OWNED); if (nr_pages > nitems(frame_list)) nr_pages = nitems(frame_list); for (page = TAILQ_FIRST(&ballooned_pages), i = 0; i < nr_pages; i++, page = TAILQ_NEXT(page, plinks.q)) { KASSERT(page != NULL, ("ballooned_pages list corrupt")); frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op( XENMEM_populate_physmap, &reservation); if (rc < nr_pages) { if (rc > 0) { int ret; /* We hit the Xen hard limit: reprobe. */ reservation.nr_extents = rc; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); KASSERT(ret == rc, ("HYPERVISOR_memory_op failed")); } if (rc >= 0) bs.hard_limit = (bs.current_pages + rc - bs.driver_pages); goto out; } for (i = 0; i < nr_pages; i++) { page = TAILQ_FIRST(&ballooned_pages); KASSERT(page != NULL, ("Unable to get ballooned page")); TAILQ_REMOVE(&ballooned_pages, page, plinks.q); bs.balloon_low--; KASSERT(xen_feature(XENFEAT_auto_translated_physmap), ("auto translated physmap but mapping is valid")); vm_page_free(page); } bs.current_pages += nr_pages; out: return (0); } static int decrease_reservation(unsigned long nr_pages) { unsigned long i; vm_page_t page; int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; mtx_assert(&balloon_mutex, MA_OWNED); if (nr_pages > nitems(frame_list)) nr_pages = nitems(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = vm_page_alloc(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_ZERO)) == NULL) { + /* + * Zero the page, or else we might be leaking important data to + * other domains on the same host. Xen doesn't scrub ballooned + * out memory pages, the guest is in charge of making sure that + * no information is leaked. + */ + if ((page = vm_page_alloc_noobj(VM_ALLOC_ZERO)) == NULL) { nr_pages = i; need_sleep = 1; break; } - if ((page->flags & PG_ZERO) == 0) { - /* - * Zero the page, or else we might be leaking - * important data to other domains on the same - * host. Xen doesn't scrub ballooned out memory - * pages, the guest is in charge of making - * sure that no information is leaked. - */ - pmap_zero_page(page); - } - frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q); bs.balloon_low++; } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed")); bs.current_pages -= nr_pages; return (need_sleep); } /* * We avoid multiple worker processes conflicting via the balloon mutex. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ static void balloon_process(void *unused) { int need_sleep = 0; long credit; mtx_lock(&balloon_mutex); for (;;) { int sleep_time; do { credit = current_target() - bs.current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) need_sleep = (decrease_reservation(-credit) != 0); } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ if (current_target() != bs.current_pages) sleep_time = hz; else sleep_time = 0; msleep(balloon_process, &balloon_mutex, 0, "balloon", sleep_time); } mtx_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ static void set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ bs.hard_limit = ~0UL; bs.target_pages = max(target, minimum_target()); wakeup(balloon_process); } static struct xs_watch target_watch = { .node = "memory/target", .max_pending = 1, }; /* React to a change in the target key */ static void watch_target(struct xs_watch *watch, const char **vec, unsigned int len) { unsigned long long new_target; int err; err = xs_scanf(XST_NIL, "memory", "target", NULL, "%llu", &new_target); if (err) { /* This is ok (for domain0 at least) - so just return */ return; } /* * The given memory/target value is in KiB, so it needs converting to * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. */ set_new_target(new_target >> KB_TO_PAGE_SHIFT); } /*------------------ Private Device Attachment Functions --------------------*/ /** * \brief Identify instances of this device type in the system. * * \param driver The driver performing this identify action. * \param parent The NewBus parent device for any devices this method adds. */ static void xenballoon_identify(driver_t *driver __unused, device_t parent) { /* * A single device instance for our driver is always present * in a system operating under Xen. */ BUS_ADD_CHILD(parent, 0, driver->name, 0); } /** * \brief Probe for the existence of the Xen Balloon device * * \param dev NewBus device_t for this Xen control instance. * * \return Always returns 0 indicating success. */ static int xenballoon_probe(device_t dev) { device_set_desc(dev, "Xen Balloon Device"); return (0); } /** * \brief Attach the Xen Balloon device. * * \param dev NewBus device_t for this Xen control instance. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xenballoon_attach(device_t dev) { int err; mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF); bs.current_pages = realmem; bs.target_pages = bs.current_pages; bs.balloon_low = 0; bs.balloon_high = 0; bs.driver_pages = 0UL; bs.hard_limit = ~0UL; kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon"); target_watch.callback = watch_target; err = xs_register_watch(&target_watch); if (err) device_printf(dev, "xenballon: failed to set balloon watcher\n"); return (err); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t xenballoon_methods[] = { /* Device interface */ DEVMETHOD(device_identify, xenballoon_identify), DEVMETHOD(device_probe, xenballoon_probe), DEVMETHOD(device_attach, xenballoon_attach), DEVMETHOD_END }; DEFINE_CLASS_0(xenballoon, xenballoon_driver, xenballoon_methods, 0); devclass_t xenballoon_devclass; DRIVER_MODULE(xenballoon, xenstore, xenballoon_driver, xenballoon_devclass, NULL, NULL); diff --git a/sys/dev/xen/gntdev/gntdev.c b/sys/dev/xen/gntdev/gntdev.c index 8249eaba8bcc..0c89133ca1b0 100644 --- a/sys/dev/xen/gntdev/gntdev.c +++ b/sys/dev/xen/gntdev/gntdev.c @@ -1,1289 +1,1282 @@ /*- * Copyright (c) 2016 Akshay Jaggi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * gntdev.c * * Interface to /dev/xen/gntdev. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GNTDEV, "gntdev", "Xen grant-table user-space device"); #define MAX_OFFSET_COUNT ((0xffffffffffffffffull >> PAGE_SHIFT) + 1) static d_open_t gntdev_open; static d_ioctl_t gntdev_ioctl; static d_mmap_single_t gntdev_mmap_single; static struct cdevsw gntdev_devsw = { .d_version = D_VERSION, .d_open = gntdev_open, .d_ioctl = gntdev_ioctl, .d_mmap_single = gntdev_mmap_single, .d_name = "gntdev", }; static device_t gntdev_dev = NULL; struct gntdev_gref; struct gntdev_gmap; STAILQ_HEAD(gref_list_head, gntdev_gref); STAILQ_HEAD(gmap_list_head, gntdev_gmap); RB_HEAD(gref_tree_head, gntdev_gref); RB_HEAD(gmap_tree_head, gntdev_gmap); struct file_offset_struct { RB_ENTRY(file_offset_struct) next; uint64_t file_offset; uint64_t count; }; static int offset_cmp(struct file_offset_struct *f1, struct file_offset_struct *f2) { return (f1->file_offset - f2->file_offset); } RB_HEAD(file_offset_head, file_offset_struct); RB_GENERATE_STATIC(file_offset_head, file_offset_struct, next, offset_cmp); struct per_user_data { struct mtx user_data_lock; struct gref_tree_head gref_tree; struct gmap_tree_head gmap_tree; struct file_offset_head file_offset; }; /* * Get offset into the file which will be used while mmapping the * appropriate pages by the userspace program. */ static int get_file_offset(struct per_user_data *priv_user, uint32_t count, uint64_t *file_offset) { struct file_offset_struct *offset, *offset_tmp; if (count == 0) return (EINVAL); mtx_lock(&priv_user->user_data_lock); RB_FOREACH_SAFE(offset, file_offset_head, &priv_user->file_offset, offset_tmp) { if (offset->count >= count) { offset->count -= count; *file_offset = offset->file_offset + offset->count * PAGE_SIZE; if (offset->count == 0) { RB_REMOVE(file_offset_head, &priv_user->file_offset, offset); free(offset, M_GNTDEV); } mtx_unlock(&priv_user->user_data_lock); return (0); } } mtx_unlock(&priv_user->user_data_lock); return (ENOSPC); } static void put_file_offset(struct per_user_data *priv_user, uint32_t count, uint64_t file_offset) { struct file_offset_struct *offset, *offset_nxt, *offset_prv; offset = malloc(sizeof(*offset), M_GNTDEV, M_WAITOK | M_ZERO); offset->file_offset = file_offset; offset->count = count; mtx_lock(&priv_user->user_data_lock); RB_INSERT(file_offset_head, &priv_user->file_offset, offset); offset_nxt = RB_NEXT(file_offset_head, &priv_user->file_offset, offset); offset_prv = RB_PREV(file_offset_head, &priv_user->file_offset, offset); if (offset_nxt != NULL && offset_nxt->file_offset == offset->file_offset + offset->count * PAGE_SIZE) { offset->count += offset_nxt->count; RB_REMOVE(file_offset_head, &priv_user->file_offset, offset_nxt); free(offset_nxt, M_GNTDEV); } if (offset_prv != NULL && offset->file_offset == offset_prv->file_offset + offset_prv->count * PAGE_SIZE) { offset_prv->count += offset->count; RB_REMOVE(file_offset_head, &priv_user->file_offset, offset); free(offset, M_GNTDEV); } mtx_unlock(&priv_user->user_data_lock); } static int gntdev_gmap_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); static void gntdev_gmap_pg_dtor(void *handle); static int gntdev_gmap_pg_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres); static struct cdev_pager_ops gntdev_gmap_pg_ops = { .cdev_pg_fault = gntdev_gmap_pg_fault, .cdev_pg_ctor = gntdev_gmap_pg_ctor, .cdev_pg_dtor = gntdev_gmap_pg_dtor, }; struct cleanup_data_struct { struct mtx to_kill_grefs_mtx; struct mtx to_kill_gmaps_mtx; struct gref_list_head to_kill_grefs; struct gmap_list_head to_kill_gmaps; }; static struct cleanup_data_struct cleanup_data = { .to_kill_grefs = STAILQ_HEAD_INITIALIZER(cleanup_data.to_kill_grefs), .to_kill_gmaps = STAILQ_HEAD_INITIALIZER(cleanup_data.to_kill_gmaps), }; MTX_SYSINIT(to_kill_grefs_mtx, &cleanup_data.to_kill_grefs_mtx, "gntdev to_kill_grefs mutex", MTX_DEF); MTX_SYSINIT(to_kill_gmaps_mtx, &cleanup_data.to_kill_gmaps_mtx, "gntdev to_kill_gmaps mutex", MTX_DEF); static void cleanup_function(void *arg, __unused int pending); static struct task cleanup_task = TASK_INITIALIZER(0, cleanup_function, &cleanup_data); struct notify_data { uint64_t index; uint32_t action; uint32_t event_channel_port; xen_intr_handle_t notify_evtchn_handle; }; static void notify(struct notify_data *notify, vm_page_t page); /*-------------------- Grant Allocation Methods -----------------------------*/ struct gntdev_gref { union gref_next_union { STAILQ_ENTRY(gntdev_gref) list; RB_ENTRY(gntdev_gref) tree; } gref_next; uint64_t file_index; grant_ref_t gref_id; vm_page_t page; struct notify_data *notify; }; static int gref_cmp(struct gntdev_gref *g1, struct gntdev_gref *g2) { return (g1->file_index - g2->file_index); } RB_GENERATE_STATIC(gref_tree_head, gntdev_gref, gref_next.tree, gref_cmp); /* * Traverse over the device-list of to-be-deleted grants allocated, and * if all accesses, both local mmaps and foreign maps, to them have ended, * destroy them. */ static void gref_list_dtor(struct cleanup_data_struct *cleanup_data) { struct gref_list_head tmp_grefs; struct gntdev_gref *gref, *gref_tmp, *gref_previous; STAILQ_INIT(&tmp_grefs); mtx_lock(&cleanup_data->to_kill_grefs_mtx); STAILQ_SWAP(&cleanup_data->to_kill_grefs, &tmp_grefs, gntdev_gref); mtx_unlock(&cleanup_data->to_kill_grefs_mtx); gref_previous = NULL; STAILQ_FOREACH_SAFE(gref, &tmp_grefs, gref_next.list, gref_tmp) { if (gref->page && gref->page->object == NULL) { if (gref->notify) { notify(gref->notify, gref->page); } if (gref->gref_id != GRANT_REF_INVALID) { if (gnttab_query_foreign_access(gref->gref_id)) continue; if (gnttab_end_foreign_access_ref(gref->gref_id) == 0) continue; gnttab_free_grant_reference(gref->gref_id); } vm_page_unwire_noq(gref->page); vm_page_free(gref->page); gref->page = NULL; } if (gref->page == NULL) { if (gref_previous == NULL) STAILQ_REMOVE_HEAD(&tmp_grefs, gref_next.list); else STAILQ_REMOVE_AFTER(&tmp_grefs, gref_previous, gref_next.list); if (gref->notify) free(gref->notify, M_GNTDEV); free(gref, M_GNTDEV); } else gref_previous = gref; } if (!STAILQ_EMPTY(&tmp_grefs)) { mtx_lock(&cleanup_data->to_kill_grefs_mtx); STAILQ_CONCAT(&cleanup_data->to_kill_grefs, &tmp_grefs); mtx_unlock(&cleanup_data->to_kill_grefs_mtx); } } /* * Find count number of contiguous allocated grants for a given userspace * program by file-offset (index). */ static struct gntdev_gref* gntdev_find_grefs(struct per_user_data *priv_user, uint64_t index, uint32_t count) { struct gntdev_gref find_gref, *gref, *gref_start = NULL; find_gref.file_index = index; mtx_lock(&priv_user->user_data_lock); gref_start = RB_FIND(gref_tree_head, &priv_user->gref_tree, &find_gref); for (gref = gref_start; gref != NULL && count > 0; gref = RB_NEXT(gref_tree_head, &priv_user->gref_tree, gref)) { if (index != gref->file_index) break; index += PAGE_SIZE; count--; } mtx_unlock(&priv_user->user_data_lock); if (count) return (NULL); return (gref_start); } /* * IOCTL_GNTDEV_ALLOC_GREF * Allocate required number of wired pages for the request, grant foreign * access to the physical frames for these pages, and add details about * this allocation to the per user private data, so that these pages can * be mmapped by the userspace program. */ static int gntdev_alloc_gref(struct ioctl_gntdev_alloc_gref *arg) { uint32_t i; int error, readonly; uint64_t file_offset; struct gntdev_gref *grefs; struct per_user_data *priv_user; readonly = !(arg->flags & GNTDEV_ALLOC_FLAG_WRITABLE); error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); /* Cleanup grefs and free pages. */ taskqueue_enqueue(taskqueue_thread, &cleanup_task); /* Get file offset for this request. */ error = get_file_offset(priv_user, arg->count, &file_offset); if (error != 0) return (error); /* Allocate grefs. */ grefs = malloc(sizeof(*grefs) * arg->count, M_GNTDEV, M_WAITOK); for (i = 0; i < arg->count; i++) { grefs[i].file_index = file_offset + i * PAGE_SIZE; grefs[i].gref_id = GRANT_REF_INVALID; grefs[i].notify = NULL; - grefs[i].page = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL - | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + grefs[i].page = vm_page_alloc_noobj(VM_ALLOC_WIRED | + VM_ALLOC_ZERO); if (grefs[i].page == NULL) { log(LOG_ERR, "Page allocation failed."); error = ENOMEM; break; } - if ((grefs[i].page->flags & PG_ZERO) == 0) { - /* - * Zero the allocated page, as we don't want to - * leak our memory to other domains. - */ - pmap_zero_page(grefs[i].page); - } grefs[i].page->valid = VM_PAGE_BITS_ALL; error = gnttab_grant_foreign_access(arg->domid, (VM_PAGE_TO_PHYS(grefs[i].page) >> PAGE_SHIFT), readonly, &grefs[i].gref_id); if (error != 0) { log(LOG_ERR, "Grant Table Hypercall failed."); break; } } if (error != 0) { /* * If target domain maps the gref (by guessing the gref-id), * then we can't clean it up yet and we have to leave the * page in place so as to not leak our memory to that domain. * Add it to a global list to be cleaned up later. */ mtx_lock(&cleanup_data.to_kill_grefs_mtx); for (i = 0; i < arg->count; i++) STAILQ_INSERT_TAIL(&cleanup_data.to_kill_grefs, &grefs[i], gref_next.list); mtx_unlock(&cleanup_data.to_kill_grefs_mtx); taskqueue_enqueue(taskqueue_thread, &cleanup_task); return (error); } /* Copy the output values. */ arg->index = file_offset; for (i = 0; i < arg->count; i++) suword32(&arg->gref_ids[i], grefs[i].gref_id); /* Modify the per user private data. */ mtx_lock(&priv_user->user_data_lock); for (i = 0; i < arg->count; i++) RB_INSERT(gref_tree_head, &priv_user->gref_tree, &grefs[i]); mtx_unlock(&priv_user->user_data_lock); return (error); } /* * IOCTL_GNTDEV_DEALLOC_GREF * Remove grant allocation information from the per user private data, so * that it can't be mmapped anymore by the userspace program, and add it * to the to-be-deleted grants global device-list. */ static int gntdev_dealloc_gref(struct ioctl_gntdev_dealloc_gref *arg) { int error; uint32_t count; struct gntdev_gref *gref, *gref_tmp; struct per_user_data *priv_user; error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); gref = gntdev_find_grefs(priv_user, arg->index, arg->count); if (gref == NULL) { log(LOG_ERR, "Can't find requested grant-refs."); return (EINVAL); } /* Remove the grefs from user private data. */ count = arg->count; mtx_lock(&priv_user->user_data_lock); mtx_lock(&cleanup_data.to_kill_grefs_mtx); for (; gref != NULL && count > 0; gref = gref_tmp) { gref_tmp = RB_NEXT(gref_tree_head, &priv_user->gref_tree, gref); RB_REMOVE(gref_tree_head, &priv_user->gref_tree, gref); STAILQ_INSERT_TAIL(&cleanup_data.to_kill_grefs, gref, gref_next.list); count--; } mtx_unlock(&cleanup_data.to_kill_grefs_mtx); mtx_unlock(&priv_user->user_data_lock); taskqueue_enqueue(taskqueue_thread, &cleanup_task); put_file_offset(priv_user, arg->count, arg->index); return (0); } /*-------------------- Grant Mapping Methods --------------------------------*/ struct gntdev_gmap_map { vm_object_t mem; struct resource *pseudo_phys_res; int pseudo_phys_res_id; vm_paddr_t phys_base_addr; }; struct gntdev_gmap { union gmap_next_union { STAILQ_ENTRY(gntdev_gmap) list; RB_ENTRY(gntdev_gmap) tree; } gmap_next; uint64_t file_index; uint32_t count; struct gnttab_map_grant_ref *grant_map_ops; struct gntdev_gmap_map *map; struct notify_data *notify; }; static int gmap_cmp(struct gntdev_gmap *g1, struct gntdev_gmap *g2) { return (g1->file_index - g2->file_index); } RB_GENERATE_STATIC(gmap_tree_head, gntdev_gmap, gmap_next.tree, gmap_cmp); /* * Traverse over the device-list of to-be-deleted grant mappings, and if * the region is no longer mmapped by anyone, free the memory used to * store information about the mapping. */ static void gmap_list_dtor(struct cleanup_data_struct *cleanup_data) { struct gmap_list_head tmp_gmaps; struct gntdev_gmap *gmap, *gmap_tmp, *gmap_previous; STAILQ_INIT(&tmp_gmaps); mtx_lock(&cleanup_data->to_kill_gmaps_mtx); STAILQ_SWAP(&cleanup_data->to_kill_gmaps, &tmp_gmaps, gntdev_gmap); mtx_unlock(&cleanup_data->to_kill_gmaps_mtx); gmap_previous = NULL; STAILQ_FOREACH_SAFE(gmap, &tmp_gmaps, gmap_next.list, gmap_tmp) { if (gmap->map == NULL) { if (gmap_previous == NULL) STAILQ_REMOVE_HEAD(&tmp_gmaps, gmap_next.list); else STAILQ_REMOVE_AFTER(&tmp_gmaps, gmap_previous, gmap_next.list); if (gmap->notify) free(gmap->notify, M_GNTDEV); free(gmap->grant_map_ops, M_GNTDEV); free(gmap, M_GNTDEV); } else gmap_previous = gmap; } if (!STAILQ_EMPTY(&tmp_gmaps)) { mtx_lock(&cleanup_data->to_kill_gmaps_mtx); STAILQ_CONCAT(&cleanup_data->to_kill_gmaps, &tmp_gmaps); mtx_unlock(&cleanup_data->to_kill_gmaps_mtx); } } /* * Find mapped grants for a given userspace program, by file-offset (index) * and count, as supplied during the map-ioctl. */ static struct gntdev_gmap* gntdev_find_gmap(struct per_user_data *priv_user, uint64_t index, uint32_t count) { struct gntdev_gmap find_gmap, *gmap; find_gmap.file_index = index; mtx_lock(&priv_user->user_data_lock); gmap = RB_FIND(gmap_tree_head, &priv_user->gmap_tree, &find_gmap); mtx_unlock(&priv_user->user_data_lock); if (gmap != NULL && gmap->count == count) return (gmap); return (NULL); } /* * Remove the pages from the mgtdevice pager, call the unmap hypercall, * free the xenmem resource. This function is called during the * destruction of the mgtdevice pager, which happens when all mmaps to * it have been removed, and the unmap-ioctl has been performed. */ static int notify_unmap_cleanup(struct gntdev_gmap *gmap) { uint32_t i; int error, count; vm_page_t m; struct gnttab_unmap_grant_ref *unmap_ops; unmap_ops = malloc(sizeof(struct gnttab_unmap_grant_ref) * gmap->count, M_GNTDEV, M_WAITOK); /* Enumerate freeable maps. */ count = 0; for (i = 0; i < gmap->count; i++) { if (gmap->grant_map_ops[i].handle != -1) { unmap_ops[count].handle = gmap->grant_map_ops[i].handle; unmap_ops[count].host_addr = gmap->grant_map_ops[i].host_addr; unmap_ops[count].dev_bus_addr = 0; count++; } } /* Perform notification. */ if (count > 0 && gmap->notify) { vm_page_t page; uint64_t page_offset; page_offset = gmap->notify->index - gmap->file_index; page = PHYS_TO_VM_PAGE(gmap->map->phys_base_addr + page_offset); notify(gmap->notify, page); } /* Free the pages. */ VM_OBJECT_WLOCK(gmap->map->mem); retry: for (i = 0; i < gmap->count; i++) { m = vm_page_lookup(gmap->map->mem, i); if (m == NULL) continue; if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) goto retry; cdev_pager_free_page(gmap->map->mem, m); } VM_OBJECT_WUNLOCK(gmap->map->mem); /* Perform unmap hypercall. */ error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); for (i = 0; i < gmap->count; i++) { gmap->grant_map_ops[i].handle = -1; gmap->grant_map_ops[i].host_addr = 0; } if (gmap->map) { error = xenmem_free(gntdev_dev, gmap->map->pseudo_phys_res_id, gmap->map->pseudo_phys_res); KASSERT(error == 0, ("Unable to release memory resource: %d", error)); free(gmap->map, M_GNTDEV); gmap->map = NULL; } free(unmap_ops, M_GNTDEV); return (error); } /* * IOCTL_GNTDEV_MAP_GRANT_REF * Populate structures for mapping the grant reference in the per user * private data. Actual resource allocation and map hypercall is performed * during the mmap. */ static int gntdev_map_grant_ref(struct ioctl_gntdev_map_grant_ref *arg) { uint32_t i; int error; struct gntdev_gmap *gmap; struct per_user_data *priv_user; error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); gmap = malloc(sizeof(*gmap), M_GNTDEV, M_WAITOK | M_ZERO); gmap->count = arg->count; gmap->grant_map_ops = malloc(sizeof(struct gnttab_map_grant_ref) * arg->count, M_GNTDEV, M_WAITOK | M_ZERO); for (i = 0; i < arg->count; i++) { struct ioctl_gntdev_grant_ref ref; error = copyin(&arg->refs[i], &ref, sizeof(ref)); if (error != 0) { free(gmap->grant_map_ops, M_GNTDEV); free(gmap, M_GNTDEV); return (error); } gmap->grant_map_ops[i].dom = ref.domid; gmap->grant_map_ops[i].ref = ref.ref; gmap->grant_map_ops[i].handle = -1; gmap->grant_map_ops[i].flags = GNTMAP_host_map; } error = get_file_offset(priv_user, arg->count, &gmap->file_index); if (error != 0) { free(gmap->grant_map_ops, M_GNTDEV); free(gmap, M_GNTDEV); return (error); } mtx_lock(&priv_user->user_data_lock); RB_INSERT(gmap_tree_head, &priv_user->gmap_tree, gmap); mtx_unlock(&priv_user->user_data_lock); arg->index = gmap->file_index; return (error); } /* * IOCTL_GNTDEV_UNMAP_GRANT_REF * Remove the map information from the per user private data and add it * to the global device-list of mappings to be deleted. A reference to * the mgtdevice pager is also decreased, the reason for which is * explained in mmap_gmap(). */ static int gntdev_unmap_grant_ref(struct ioctl_gntdev_unmap_grant_ref *arg) { int error; struct gntdev_gmap *gmap; struct per_user_data *priv_user; error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); gmap = gntdev_find_gmap(priv_user, arg->index, arg->count); if (gmap == NULL) { log(LOG_ERR, "Can't find requested grant-map."); return (EINVAL); } mtx_lock(&priv_user->user_data_lock); mtx_lock(&cleanup_data.to_kill_gmaps_mtx); RB_REMOVE(gmap_tree_head, &priv_user->gmap_tree, gmap); STAILQ_INSERT_TAIL(&cleanup_data.to_kill_gmaps, gmap, gmap_next.list); mtx_unlock(&cleanup_data.to_kill_gmaps_mtx); mtx_unlock(&priv_user->user_data_lock); if (gmap->map) vm_object_deallocate(gmap->map->mem); taskqueue_enqueue(taskqueue_thread, &cleanup_task); put_file_offset(priv_user, arg->count, arg->index); return (0); } /* * IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR * Get file-offset and count for a given mapping, from the virtual address * where the mapping is mmapped. * Please note, this only works for grants mapped by this domain, and not * grants allocated. Count doesn't make much sense in reference to grants * allocated. Also, because this function is present in the linux gntdev * device, but not in the linux gntalloc one, most userspace code only use * it for mapped grants. */ static int gntdev_get_offset_for_vaddr(struct ioctl_gntdev_get_offset_for_vaddr *arg, struct thread *td) { int error; vm_map_t map; vm_map_entry_t entry; vm_object_t mem; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; struct gntdev_gmap *gmap; int rc; map = &td->td_proc->p_vmspace->vm_map; error = vm_map_lookup(&map, arg->vaddr, VM_PROT_NONE, &entry, &mem, &pindex, &prot, &wired); if (error != KERN_SUCCESS) return (EINVAL); if ((mem->type != OBJT_MGTDEVICE) || (mem->un_pager.devp.ops != &gntdev_gmap_pg_ops)) { rc = EINVAL; goto out; } gmap = mem->handle; if (gmap == NULL || (entry->end - entry->start) != (gmap->count * PAGE_SIZE)) { rc = EINVAL; goto out; } arg->count = gmap->count; arg->offset = gmap->file_index; rc = 0; out: vm_map_lookup_done(map, entry); return (rc); } /*-------------------- Grant Mapping Pager ----------------------------------*/ static int gntdev_gmap_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { return (0); } static void gntdev_gmap_pg_dtor(void *handle) { notify_unmap_cleanup((struct gntdev_gmap *)handle); } static int gntdev_gmap_pg_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct gntdev_gmap *gmap = object->handle; vm_pindex_t pidx, ridx; vm_page_t page; vm_ooffset_t relative_offset; if (gmap->map == NULL) return (VM_PAGER_FAIL); relative_offset = offset - gmap->file_index; pidx = OFF_TO_IDX(offset); ridx = OFF_TO_IDX(relative_offset); if (ridx >= gmap->count || gmap->grant_map_ops[ridx].status != GNTST_okay) return (VM_PAGER_FAIL); page = PHYS_TO_VM_PAGE(gmap->map->phys_base_addr + relative_offset); if (page == NULL) return (VM_PAGER_FAIL); KASSERT((page->flags & PG_FICTITIOUS) != 0, ("not fictitious %p", page)); KASSERT(vm_page_wired(page), ("page %p is not wired", page)); KASSERT(!vm_page_busied(page), ("page %p is busy", page)); vm_page_busy_acquire(page, 0); vm_page_valid(page); if (*mres != NULL) vm_page_replace(page, object, pidx, *mres); else vm_page_insert(page, object, pidx); *mres = page; return (VM_PAGER_OK); } /*------------------ Grant Table Methods ------------------------------------*/ static void notify(struct notify_data *notify, vm_page_t page) { if (notify->action & UNMAP_NOTIFY_CLEAR_BYTE) { uint8_t *mem; uint64_t offset; offset = notify->index & PAGE_MASK; mem = (uint8_t *)pmap_quick_enter_page(page); mem[offset] = 0; pmap_quick_remove_page((vm_offset_t)mem); } if (notify->action & UNMAP_NOTIFY_SEND_EVENT) { xen_intr_signal(notify->notify_evtchn_handle); xen_intr_unbind(¬ify->notify_evtchn_handle); } notify->action = 0; } /* * Helper to copy new arguments from the notify ioctl into * the existing notify data. */ static int copy_notify_helper(struct notify_data *destination, struct ioctl_gntdev_unmap_notify *source) { xen_intr_handle_t handlep = NULL; /* * "Get" before "Put"ting previous reference, as we might be * holding the last reference to the event channel port. */ if (source->action & UNMAP_NOTIFY_SEND_EVENT) if (xen_intr_get_evtchn_from_port(source->event_channel_port, &handlep) != 0) return (EINVAL); if (destination->action & UNMAP_NOTIFY_SEND_EVENT) xen_intr_unbind(&destination->notify_evtchn_handle); destination->action = source->action; destination->event_channel_port = source->event_channel_port; destination->index = source->index; destination->notify_evtchn_handle = handlep; return (0); } /* * IOCTL_GNTDEV_SET_UNMAP_NOTIFY * Set unmap notification inside the appropriate grant. It sends a * notification when the grant is completely munmapped by this domain * and ready for destruction. */ static int gntdev_set_unmap_notify(struct ioctl_gntdev_unmap_notify *arg) { int error; uint64_t index; struct per_user_data *priv_user; struct gntdev_gref *gref = NULL; struct gntdev_gmap *gmap; error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); if (arg->action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) return (EINVAL); index = arg->index & ~PAGE_MASK; gref = gntdev_find_grefs(priv_user, index, 1); if (gref) { if (gref->notify == NULL) gref->notify = malloc(sizeof(*arg), M_GNTDEV, M_WAITOK | M_ZERO); return (copy_notify_helper(gref->notify, arg)); } error = EINVAL; mtx_lock(&priv_user->user_data_lock); RB_FOREACH(gmap, gmap_tree_head, &priv_user->gmap_tree) { if (arg->index >= gmap->file_index && arg->index < gmap->file_index + gmap->count * PAGE_SIZE) { if (gmap->notify == NULL) gmap->notify = malloc(sizeof(*arg), M_GNTDEV, M_WAITOK | M_ZERO); error = copy_notify_helper(gmap->notify, arg); break; } } mtx_unlock(&priv_user->user_data_lock); return (error); } /*------------------ Gntdev Char Device Methods -----------------------------*/ static void cleanup_function(void *arg, __unused int pending) { gref_list_dtor((struct cleanup_data_struct *) arg); gmap_list_dtor((struct cleanup_data_struct *) arg); } static void per_user_data_dtor(void *arg) { struct gntdev_gref *gref, *gref_tmp; struct gntdev_gmap *gmap, *gmap_tmp; struct file_offset_struct *offset, *offset_tmp; struct per_user_data *priv_user; priv_user = (struct per_user_data *) arg; mtx_lock(&priv_user->user_data_lock); mtx_lock(&cleanup_data.to_kill_grefs_mtx); RB_FOREACH_SAFE(gref, gref_tree_head, &priv_user->gref_tree, gref_tmp) { RB_REMOVE(gref_tree_head, &priv_user->gref_tree, gref); STAILQ_INSERT_TAIL(&cleanup_data.to_kill_grefs, gref, gref_next.list); } mtx_unlock(&cleanup_data.to_kill_grefs_mtx); mtx_lock(&cleanup_data.to_kill_gmaps_mtx); RB_FOREACH_SAFE(gmap, gmap_tree_head, &priv_user->gmap_tree, gmap_tmp) { RB_REMOVE(gmap_tree_head, &priv_user->gmap_tree, gmap); STAILQ_INSERT_TAIL(&cleanup_data.to_kill_gmaps, gmap, gmap_next.list); if (gmap->map) vm_object_deallocate(gmap->map->mem); } mtx_unlock(&cleanup_data.to_kill_gmaps_mtx); RB_FOREACH_SAFE(offset, file_offset_head, &priv_user->file_offset, offset_tmp) { RB_REMOVE(file_offset_head, &priv_user->file_offset, offset); free(offset, M_GNTDEV); } mtx_unlock(&priv_user->user_data_lock); taskqueue_enqueue(taskqueue_thread, &cleanup_task); mtx_destroy(&priv_user->user_data_lock); free(priv_user, M_GNTDEV); } static int gntdev_open(struct cdev *dev, int flag, int otyp, struct thread *td) { int error; struct per_user_data *priv_user; struct file_offset_struct *offset; priv_user = malloc(sizeof(*priv_user), M_GNTDEV, M_WAITOK | M_ZERO); RB_INIT(&priv_user->gref_tree); RB_INIT(&priv_user->gmap_tree); RB_INIT(&priv_user->file_offset); offset = malloc(sizeof(*offset), M_GNTDEV, M_WAITOK | M_ZERO); offset->file_offset = 0; offset->count = MAX_OFFSET_COUNT; RB_INSERT(file_offset_head, &priv_user->file_offset, offset); mtx_init(&priv_user->user_data_lock, "per user data mutex", NULL, MTX_DEF); error = devfs_set_cdevpriv(priv_user, per_user_data_dtor); if (error != 0) per_user_data_dtor(priv_user); return (error); } static int gntdev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch (cmd) { case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: error = gntdev_set_unmap_notify( (struct ioctl_gntdev_unmap_notify*) data); break; case IOCTL_GNTDEV_ALLOC_GREF: error = gntdev_alloc_gref( (struct ioctl_gntdev_alloc_gref*) data); break; case IOCTL_GNTDEV_DEALLOC_GREF: error = gntdev_dealloc_gref( (struct ioctl_gntdev_dealloc_gref*) data); break; case IOCTL_GNTDEV_MAP_GRANT_REF: error = gntdev_map_grant_ref( (struct ioctl_gntdev_map_grant_ref*) data); break; case IOCTL_GNTDEV_UNMAP_GRANT_REF: error = gntdev_unmap_grant_ref( (struct ioctl_gntdev_unmap_grant_ref*) data); break; case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: error = gntdev_get_offset_for_vaddr( (struct ioctl_gntdev_get_offset_for_vaddr*) data, td); break; default: error = ENOSYS; break; } return (error); } /* * MMAP an allocated grant into user memory. * Please note, that the grants must not already be mmapped, otherwise * this function will fail. */ static int mmap_gref(struct per_user_data *priv_user, struct gntdev_gref *gref_start, uint32_t count, vm_size_t size, struct vm_object **object) { vm_object_t mem_obj; struct gntdev_gref *gref; mem_obj = vm_pager_allocate(OBJT_PHYS, NULL, size, VM_PROT_ALL, 0, curthread->td_ucred); if (mem_obj == NULL) return (ENOMEM); mtx_lock(&priv_user->user_data_lock); VM_OBJECT_WLOCK(mem_obj); for (gref = gref_start; gref != NULL && count > 0; gref = RB_NEXT(gref_tree_head, &priv_user->gref_tree, gref)) { if (gref->page->object) break; vm_page_insert(gref->page, mem_obj, OFF_TO_IDX(gref->file_index)); count--; } VM_OBJECT_WUNLOCK(mem_obj); mtx_unlock(&priv_user->user_data_lock); if (count) { vm_object_deallocate(mem_obj); return (EINVAL); } *object = mem_obj; return (0); } /* * MMAP a mapped grant into user memory. */ static int mmap_gmap(struct per_user_data *priv_user, struct gntdev_gmap *gmap_start, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { uint32_t i; int error; /* * The grant map hypercall might already be done. * If that is the case, increase a reference to the * vm object and return the already allocated object. */ if (gmap_start->map) { vm_object_reference(gmap_start->map->mem); *object = gmap_start->map->mem; return (0); } gmap_start->map = malloc(sizeof(*(gmap_start->map)), M_GNTDEV, M_WAITOK | M_ZERO); /* Allocate the xen pseudo physical memory resource. */ gmap_start->map->pseudo_phys_res_id = 0; gmap_start->map->pseudo_phys_res = xenmem_alloc(gntdev_dev, &gmap_start->map->pseudo_phys_res_id, size); if (gmap_start->map->pseudo_phys_res == NULL) { free(gmap_start->map, M_GNTDEV); gmap_start->map = NULL; return (ENOMEM); } gmap_start->map->phys_base_addr = rman_get_start(gmap_start->map->pseudo_phys_res); /* Allocate the mgtdevice pager. */ gmap_start->map->mem = cdev_pager_allocate(gmap_start, OBJT_MGTDEVICE, &gntdev_gmap_pg_ops, size, nprot, *offset, NULL); if (gmap_start->map->mem == NULL) { xenmem_free(gntdev_dev, gmap_start->map->pseudo_phys_res_id, gmap_start->map->pseudo_phys_res); free(gmap_start->map, M_GNTDEV); gmap_start->map = NULL; return (ENOMEM); } for (i = 0; i < gmap_start->count; i++) { gmap_start->grant_map_ops[i].host_addr = gmap_start->map->phys_base_addr + i * PAGE_SIZE; if ((nprot & PROT_WRITE) == 0) gmap_start->grant_map_ops[i].flags |= GNTMAP_readonly; } /* Make the MAP hypercall. */ error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gmap_start->grant_map_ops, gmap_start->count); if (error != 0) { /* * Deallocate pager. * Pager deallocation will automatically take care of * xenmem deallocation, etc. */ vm_object_deallocate(gmap_start->map->mem); return (EINVAL); } /* Retry EAGAIN maps. */ for (i = 0; i < gmap_start->count; i++) { int delay = 1; while (delay < 256 && gmap_start->grant_map_ops[i].status == GNTST_eagain) { HYPERVISOR_grant_table_op( GNTTABOP_map_grant_ref, &gmap_start->grant_map_ops[i], 1); pause(("gntmap"), delay * SBT_1MS); delay++; } if (gmap_start->grant_map_ops[i].status == GNTST_eagain) gmap_start->grant_map_ops[i].status = GNTST_bad_page; if (gmap_start->grant_map_ops[i].status != GNTST_okay) { /* * Deallocate pager. * Pager deallocation will automatically take care of * xenmem deallocation, notification, unmap hypercall, * etc. */ vm_object_deallocate(gmap_start->map->mem); return (EINVAL); } } /* * Add a reference to the vm object. We do not want * the vm object to be deleted when all the mmaps are * unmapped, because it may be re-mmapped. Instead, * we want the object to be deleted, when along with * munmaps, we have also processed the unmap-ioctl. */ vm_object_reference(gmap_start->map->mem); *object = gmap_start->map->mem; return (0); } static int gntdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { int error; uint32_t count; struct gntdev_gref *gref_start; struct gntdev_gmap *gmap_start; struct per_user_data *priv_user; error = devfs_get_cdevpriv((void**) &priv_user); if (error != 0) return (EINVAL); count = OFF_TO_IDX(size); gref_start = gntdev_find_grefs(priv_user, *offset, count); if (gref_start) { error = mmap_gref(priv_user, gref_start, count, size, object); return (error); } gmap_start = gntdev_find_gmap(priv_user, *offset, count); if (gmap_start) { error = mmap_gmap(priv_user, gmap_start, offset, size, object, nprot); return (error); } return (EINVAL); } /*------------------ Private Device Attachment Functions --------------------*/ static void gntdev_identify(driver_t *driver, device_t parent) { KASSERT((xen_domain()), ("Trying to attach gntdev device on non Xen domain")); if (BUS_ADD_CHILD(parent, 0, "gntdev", 0) == NULL) panic("unable to attach gntdev user-space device"); } static int gntdev_probe(device_t dev) { gntdev_dev = dev; device_set_desc(dev, "Xen grant-table user-space device"); return (BUS_PROBE_NOWILDCARD); } static int gntdev_attach(device_t dev) { make_dev_credf(MAKEDEV_ETERNAL, &gntdev_devsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "xen/gntdev"); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t gntdev_methods[] = { DEVMETHOD(device_identify, gntdev_identify), DEVMETHOD(device_probe, gntdev_probe), DEVMETHOD(device_attach, gntdev_attach), DEVMETHOD_END }; static driver_t gntdev_driver = { "gntdev", gntdev_methods, 0, }; devclass_t gntdev_devclass; DRIVER_MODULE(gntdev, xenpv, gntdev_driver, gntdev_devclass, 0, 0); MODULE_DEPEND(gntdev, xenpv, 1, 1, 1); diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index 73ce2afefd20..98f1f3d642b3 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -1,4930 +1,4925 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1; /* And other global data */ nfstype nfsv34_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFSOCK, NFFIFO, NFNON }; enum vtype newnv2tov_type[8] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv34tov_type[8]={ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; struct timeval nfsboottime; /* Copy boottime once, so it never changes */ int nfscl_ticks; int nfsrv_useacl = 1; struct nfssockreq nfsrv_nfsuserdsock; nfsuserd_state nfsrv_nfsuserd = NOTRUNNING; static int nfsrv_userdupcalls = 0; struct nfsreqhead nfsd_reqq; uid_t nfsrv_defaultuid = UID_NOBODY; gid_t nfsrv_defaultgid = GID_NOGROUP; int nfsrv_lease = NFSRV_LEASE; int ncl_mbuf_mlen = MLEN; int nfsd_enable_stringtouid = 0; int nfsrv_doflexfile = 0; static int nfs_enable_uidtostring = 0; NFSNAMEIDMUTEX; NFSSOCKMUTEX; extern int nfsrv_lughashsize; extern struct mtx nfsrv_dslock_mtx; extern volatile int nfsrv_devidcnt; extern int nfscl_debuglevel; extern struct nfsdevicehead nfsrv_devidhead; extern struct nfsstatsv1 nfsstatsv1; extern uint32_t nfs_srvmaxio; SYSCTL_DECL(_vfs_nfs); SYSCTL_INT(_vfs_nfs, OID_AUTO, enable_uidtostring, CTLFLAG_RW, &nfs_enable_uidtostring, 0, "Make nfs always send numeric owner_names"); int nfsrv_maxpnfsmirror = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, pnfsmirror, CTLFLAG_RD, &nfsrv_maxpnfsmirror, 0, "Mirror level for pNFS service"); /* * This array of structures indicates, for V4: * retfh - which of 3 types of calling args are used * 0 - doesn't change cfh or use a sfh * 1 - replaces cfh with a new one (unless it returns an error status) * 2 - uses cfh and sfh * needscfh - if the op wants a cfh and premtime * 0 - doesn't use a cfh * 1 - uses a cfh, but doesn't want pre-op attributes * 2 - uses a cfh and wants pre-op attributes * savereply - indicates a non-idempotent Op * 0 - not non-idempotent * 1 - non-idempotent * Ops that are ordered via seqid# are handled separately from these * non-idempotent Ops. * Define it here, since it is used by both the client and server. */ struct nfsv4_opflag nfsv4_opflag[NFSV42_NOPS] = { { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Access */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Close */ { 0, 2, 0, 1, LK_EXCLUSIVE, 1, 1 }, /* Commit */ { 1, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Create */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegpurge */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegreturn */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Getattr */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* GetFH */ { 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Link */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Lock */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockT */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockU */ { 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookup */ { 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookupp */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* NVerify */ { 1, 1, 0, 1, LK_EXCLUSIVE, 1, 0 }, /* Open */ { 1, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenAttr */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenConfirm */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenDowngrade */ { 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutFH */ { 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutPubFH */ { 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutRootFH */ { 0, 1, 0, 0, LK_SHARED, 1, 0 }, /* Read */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Readdir */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* ReadLink */ { 0, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Remove */ { 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Rename */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Renew */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* RestoreFH */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SaveFH */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SecInfo */ { 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Setattr */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientID */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientIDConfirm */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Verify */ { 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Write */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* ReleaseLockOwner */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Backchannel Ctrl */ { 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Bind Conn to Sess */ { 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Exchange ID */ { 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Create Session */ { 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy Session */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Free StateID */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Dir Deleg */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device Info */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device List */ { 0, 1, 0, 1, LK_EXCLUSIVE, 1, 1 }, /* Layout Commit */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Layout Get */ { 0, 1, 0, 1, LK_EXCLUSIVE, 1, 0 }, /* Layout Return */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Secinfo No name */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Sequence */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Set SSV */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Test StateID */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Want Delegation */ { 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy ClientID */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Reclaim Complete */ { 0, 1, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Allocate */ { 2, 1, 1, 0, LK_SHARED, 1, 0 }, /* Copy */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Copy Notify */ { 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Deallocate */ { 0, 1, 0, 0, LK_SHARED, 1, 0 }, /* IO Advise */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Layout Error */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Layout Stats */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Offload Cancel */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Offload Status */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Read Plus */ { 0, 1, 0, 0, LK_SHARED, 1, 0 }, /* Seek */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Write Same */ { 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Clone */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Getxattr */ { 0, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Setxattr */ { 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Listxattrs */ { 0, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Removexattr */ }; static int ncl_mbuf_mhlen = MHLEN; static int nfsrv_usercnt = 0; static int nfsrv_dnsnamelen; static u_char *nfsrv_dnsname = NULL; static int nfsrv_usermax = 999999999; struct nfsrv_lughash { struct mtx mtx; struct nfsuserhashhead lughead; }; static struct nfsrv_lughash *nfsuserhash; static struct nfsrv_lughash *nfsusernamehash; static struct nfsrv_lughash *nfsgrouphash; static struct nfsrv_lughash *nfsgroupnamehash; /* * This static array indicates whether or not the RPC generates a large * reply. This is used by nfs_reply() to decide whether or not an mbuf * cluster should be allocated. (If a cluster is required by an RPC * marked 0 in this array, the code will still work, just not quite as * efficiently.) */ static int nfs_bigreply[NFSV42_NPROCS] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0 }; /* local functions */ static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep); static void nfsv4_wanted(struct nfsv4lock *lp); static int nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len); static int nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name); static void nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser); static int nfsrv_getrefstr(struct nfsrv_descript *, u_char **, u_char **, int *, int *); static void nfsrv_refstrbigenough(int, u_char **, u_char **, int *); static struct { int op; int opcnt; const u_char *tag; int taglen; } nfsv4_opmap[NFSV42_NPROCS] = { { 0, 1, "Null", 4 }, { NFSV4OP_GETATTR, 1, "Getattr", 7, }, { NFSV4OP_SETATTR, 2, "Setattr", 7, }, { NFSV4OP_LOOKUP, 3, "Lookup", 6, }, { NFSV4OP_ACCESS, 2, "Access", 6, }, { NFSV4OP_READLINK, 2, "Readlink", 8, }, { NFSV4OP_READ, 1, "Read", 4, }, { NFSV4OP_WRITE, 2, "Write", 5, }, { NFSV4OP_OPEN, 5, "Open", 4, }, { NFSV4OP_CREATE, 5, "Create", 6, }, { NFSV4OP_CREATE, 1, "Create", 6, }, { NFSV4OP_CREATE, 3, "Create", 6, }, { NFSV4OP_REMOVE, 1, "Remove", 6, }, { NFSV4OP_REMOVE, 1, "Remove", 6, }, { NFSV4OP_SAVEFH, 5, "Rename", 6, }, { NFSV4OP_SAVEFH, 4, "Link", 4, }, { NFSV4OP_READDIR, 2, "Readdir", 7, }, { NFSV4OP_READDIR, 2, "Readdir", 7, }, { NFSV4OP_GETATTR, 1, "Getattr", 7, }, { NFSV4OP_GETATTR, 1, "Getattr", 7, }, { NFSV4OP_GETATTR, 1, "Getattr", 7, }, { NFSV4OP_COMMIT, 2, "Commit", 6, }, { NFSV4OP_LOOKUPP, 3, "Lookupp", 7, }, { NFSV4OP_SETCLIENTID, 1, "SetClientID", 11, }, { NFSV4OP_SETCLIENTIDCFRM, 1, "SetClientIDConfirm", 18, }, { NFSV4OP_LOCK, 1, "Lock", 4, }, { NFSV4OP_LOCKU, 1, "LockU", 5, }, { NFSV4OP_OPEN, 2, "Open", 4, }, { NFSV4OP_CLOSE, 1, "Close", 5, }, { NFSV4OP_OPENCONFIRM, 1, "Openconfirm", 11, }, { NFSV4OP_LOCKT, 1, "LockT", 5, }, { NFSV4OP_OPENDOWNGRADE, 1, "Opendowngrade", 13, }, { NFSV4OP_RENEW, 1, "Renew", 5, }, { NFSV4OP_PUTROOTFH, 1, "Dirpath", 7, }, { NFSV4OP_RELEASELCKOWN, 1, "Rellckown", 9, }, { NFSV4OP_DELEGRETURN, 1, "Delegret", 8, }, { NFSV4OP_DELEGRETURN, 3, "DelegRemove", 11, }, { NFSV4OP_DELEGRETURN, 7, "DelegRename1", 12, }, { NFSV4OP_DELEGRETURN, 9, "DelegRename2", 12, }, { NFSV4OP_GETATTR, 1, "Getacl", 6, }, { NFSV4OP_SETATTR, 1, "Setacl", 6, }, { NFSV4OP_EXCHANGEID, 1, "ExchangeID", 10, }, { NFSV4OP_CREATESESSION, 1, "CreateSession", 13, }, { NFSV4OP_DESTROYSESSION, 1, "DestroySession", 14, }, { NFSV4OP_DESTROYCLIENTID, 1, "DestroyClient", 13, }, { NFSV4OP_FREESTATEID, 1, "FreeStateID", 11, }, { NFSV4OP_LAYOUTGET, 1, "LayoutGet", 9, }, { NFSV4OP_GETDEVINFO, 1, "GetDeviceInfo", 13, }, { NFSV4OP_LAYOUTCOMMIT, 1, "LayoutCommit", 12, }, { NFSV4OP_LAYOUTRETURN, 1, "LayoutReturn", 12, }, { NFSV4OP_RECLAIMCOMPL, 1, "ReclaimComplete", 15, }, { NFSV4OP_WRITE, 1, "WriteDS", 7, }, { NFSV4OP_READ, 1, "ReadDS", 6, }, { NFSV4OP_COMMIT, 1, "CommitDS", 8, }, { NFSV4OP_OPEN, 3, "OpenLayoutGet", 13, }, { NFSV4OP_OPEN, 8, "CreateLayGet", 12, }, { NFSV4OP_IOADVISE, 1, "Advise", 6, }, { NFSV4OP_ALLOCATE, 2, "Allocate", 8, }, { NFSV4OP_SAVEFH, 5, "Copy", 4, }, { NFSV4OP_SEEK, 2, "Seek", 4, }, { NFSV4OP_SEEK, 1, "SeekDS", 6, }, { NFSV4OP_GETXATTR, 2, "Getxattr", 8, }, { NFSV4OP_SETXATTR, 2, "Setxattr", 8, }, { NFSV4OP_REMOVEXATTR, 2, "Rmxattr", 7, }, { NFSV4OP_LISTXATTRS, 2, "Listxattr", 9, }, { NFSV4OP_BINDCONNTOSESS, 1, "BindConSess", 11, }, { NFSV4OP_LOOKUP, 5, "LookupOpen", 10, }, { NFSV4OP_DEALLOCATE, 2, "Deallocate", 10, }, }; /* * NFS RPCS that have large request message size. */ static int nfs_bigrequest[NFSV42_NPROCS] = { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 }; /* * Start building a request. Mostly just put the first file handle in * place. */ void nfscl_reqstart(struct nfsrv_descript *nd, int procnum, struct nfsmount *nmp, u_int8_t *nfhp, int fhlen, u_int32_t **opcntpp, struct nfsclsession *sep, int vers, int minorvers) { struct mbuf *mb; u_int32_t *tl; int opcnt; nfsattrbit_t attrbits; /* * First, fill in some of the fields of nd. */ nd->nd_slotseq = NULL; if (vers == NFS_VER4) { nd->nd_flag = ND_NFSV4 | ND_NFSCL; if (minorvers == NFSV41_MINORVERSION) nd->nd_flag |= ND_NFSV41; else if (minorvers == NFSV42_MINORVERSION) nd->nd_flag |= (ND_NFSV41 | ND_NFSV42); } else if (vers == NFS_VER3) nd->nd_flag = ND_NFSV3 | ND_NFSCL; else { if (NFSHASNFSV4(nmp)) { nd->nd_flag = ND_NFSV4 | ND_NFSCL; if (nmp->nm_minorvers == 1) nd->nd_flag |= ND_NFSV41; else if (nmp->nm_minorvers == 2) nd->nd_flag |= (ND_NFSV41 | ND_NFSV42); } else if (NFSHASNFSV3(nmp)) nd->nd_flag = ND_NFSV3 | ND_NFSCL; else nd->nd_flag = ND_NFSV2 | ND_NFSCL; } nd->nd_procnum = procnum; nd->nd_repstat = 0; nd->nd_maxextsiz = 0; /* * Get the first mbuf for the request. */ if (nfs_bigrequest[procnum]) NFSMCLGET(mb, M_WAITOK); else NFSMGET(mb); mb->m_len = 0; nd->nd_mreq = nd->nd_mb = mb; nd->nd_bpos = mtod(mb, char *); /* * And fill the first file handle into the request. */ if (nd->nd_flag & ND_NFSV4) { opcnt = nfsv4_opmap[procnum].opcnt + nfsv4_opflag[nfsv4_opmap[procnum].op].needscfh; if ((nd->nd_flag & ND_NFSV41) != 0) { opcnt += nfsv4_opflag[nfsv4_opmap[procnum].op].needsseq; if (procnum == NFSPROC_RENEW) /* * For the special case of Renew, just do a * Sequence Op. */ opcnt = 1; else if (procnum == NFSPROC_WRITEDS || procnum == NFSPROC_COMMITDS) /* * For the special case of a Writeor Commit to * a DS, the opcnt == 3, for Sequence, PutFH, * Write/Commit. */ opcnt = 3; } /* * What should the tag really be? */ (void) nfsm_strtom(nd, nfsv4_opmap[procnum].tag, nfsv4_opmap[procnum].taglen); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if ((nd->nd_flag & ND_NFSV42) != 0) *tl++ = txdr_unsigned(NFSV42_MINORVERSION); else if ((nd->nd_flag & ND_NFSV41) != 0) *tl++ = txdr_unsigned(NFSV41_MINORVERSION); else *tl++ = txdr_unsigned(NFSV4_MINORVERSION); if (opcntpp != NULL) *opcntpp = tl; *tl = txdr_unsigned(opcnt); if ((nd->nd_flag & ND_NFSV41) != 0 && nfsv4_opflag[nfsv4_opmap[procnum].op].needsseq > 0) { if (nfsv4_opflag[nfsv4_opmap[procnum].op].loopbadsess > 0) nd->nd_flag |= ND_LOOPBADSESS; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_SEQUENCE); if (sep == NULL) { sep = nfsmnt_mdssession(nmp); nfsv4_setsequence(nmp, nd, sep, nfs_bigreply[procnum]); } else nfsv4_setsequence(nmp, nd, sep, nfs_bigreply[procnum]); } if (nfsv4_opflag[nfsv4_opmap[procnum].op].needscfh > 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, nfhp, fhlen, 0); if (nfsv4_opflag[nfsv4_opmap[procnum].op].needscfh == 2 && procnum != NFSPROC_WRITEDS && procnum != NFSPROC_COMMITDS) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); /* * For Lookup Ops, we want all the directory * attributes, so we can load the name cache. */ if (procnum == NFSPROC_LOOKUP || procnum == NFSPROC_LOOKUPP || procnum == NFSPROC_LOOKUPOPEN) NFSGETATTR_ATTRBIT(&attrbits); else { NFSWCCATTR_ATTRBIT(&attrbits); nd->nd_flag |= ND_V4WCCATTR; } (void) nfsrv_putattrbit(nd, &attrbits); } } if (procnum != NFSPROC_RENEW || (nd->nd_flag & ND_NFSV41) == 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfsv4_opmap[procnum].op); } } else { (void) nfsm_fhtom(nd, nfhp, fhlen, 0); } if (procnum < NFSV42_NPROCS) NFSINCRGLOBAL(nfsstatsv1.rpccnt[procnum]); } /* * Put a state Id in the mbuf list. */ void nfsm_stateidtom(struct nfsrv_descript *nd, nfsv4stateid_t *stateidp, int flag) { nfsv4stateid_t *st; NFSM_BUILD(st, nfsv4stateid_t *, NFSX_STATEID); if (flag == NFSSTATEID_PUTALLZERO) { st->seqid = 0; st->other[0] = 0; st->other[1] = 0; st->other[2] = 0; } else if (flag == NFSSTATEID_PUTALLONE) { st->seqid = 0xffffffff; st->other[0] = 0xffffffff; st->other[1] = 0xffffffff; st->other[2] = 0xffffffff; } else if (flag == NFSSTATEID_PUTSEQIDZERO) { st->seqid = 0; st->other[0] = stateidp->other[0]; st->other[1] = stateidp->other[1]; st->other[2] = stateidp->other[2]; } else { st->seqid = stateidp->seqid; st->other[0] = stateidp->other[0]; st->other[1] = stateidp->other[1]; st->other[2] = stateidp->other[2]; } } /* * Fill in the setable attributes. The full argument indicates whether * to fill in them all or just mode and time. */ void nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, struct vnode *vp, int flags, u_int32_t rdev) { u_int32_t *tl; struct nfsv2_sattr *sp; nfsattrbit_t attrbits; struct nfsnode *np; switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) { case ND_NFSV2: NFSM_BUILD(sp, struct nfsv2_sattr *, NFSX_V2SATTR); if (vap->va_mode == (mode_t)VNOVAL) sp->sa_mode = newnfs_xdrneg1; else sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); if (vap->va_uid == (uid_t)VNOVAL) sp->sa_uid = newnfs_xdrneg1; else sp->sa_uid = txdr_unsigned(vap->va_uid); if (vap->va_gid == (gid_t)VNOVAL) sp->sa_gid = newnfs_xdrneg1; else sp->sa_gid = txdr_unsigned(vap->va_gid); if (flags & NFSSATTR_SIZE0) sp->sa_size = 0; else if (flags & NFSSATTR_SIZENEG1) sp->sa_size = newnfs_xdrneg1; else if (flags & NFSSATTR_SIZERDEV) sp->sa_size = txdr_unsigned(rdev); else sp->sa_size = txdr_unsigned(vap->va_size); txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); break; case ND_NFSV3: if (vap->va_mode != (mode_t)VNOVAL) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl = txdr_unsigned(vap->va_mode); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl = txdr_unsigned(vap->va_uid); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl = txdr_unsigned(vap->va_gid); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; txdr_hyper(vap->va_size, tl); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } if (vap->va_atime.tv_sec != VNOVAL) { if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_atime, tl); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); } } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } if (vap->va_mtime.tv_sec != VNOVAL) { if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_mtime, tl); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER); } } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } break; case ND_NFSV4: NFSZERO_ATTRBIT(&attrbits); if (vap->va_mode != (mode_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE); if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP); if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); if (vap->va_atime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET); if (vap->va_mtime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET); if (vap->va_birthtime.tv_sec != VNOVAL && strcmp(vp->v_mount->mnt_vfc->vfc_name, "nfs") == 0) { /* * We can only test for support of TimeCreate if * the "vp" argument is for an NFS vnode. */ np = VTONFS(vp); if (NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_TIMECREATE)) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); } (void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0, &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); break; } } #ifndef APPLE /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbufuio(struct nfsrv_descript *nd, struct uio *uiop, int siz) { char *mbufcp, *uiocp; int xfer, left, len; struct mbuf *mp; long uiosiz, rem; int error = 0; mp = nd->nd_md; mbufcp = nd->nd_dpos; len = mtod(mp, caddr_t) + mp->m_len - mbufcp; rem = NFSM_RNDUP(siz) - siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) { error = EBADRPC; goto out; } left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) { error = EBADRPC; goto out; } mbufcp = mtod(mp, caddr_t); len = mp->m_len; KASSERT(len >= 0, ("len %d, corrupted mbuf?", len)); } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) NFSBCOPY(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (void *) ((char *)uiop->uio_iov->iov_base + uiosiz); uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } nd->nd_dpos = mbufcp; nd->nd_md = mp; if (rem > 0) { if (len < rem) error = nfsm_advance(nd, rem, len); else nd->nd_dpos += rem; } out: NFSEXITCODE2(error, nd); return (error); } #endif /* !APPLE */ /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macro NFSM_DISSECT for tough * cases. */ void * nfsm_dissct(struct nfsrv_descript *nd, int siz, int how) { struct mbuf *mp2; int siz2, xfer; caddr_t p; int left; caddr_t retp; retp = NULL; left = mtod(nd->nd_md, caddr_t) + nd->nd_md->m_len - nd->nd_dpos; while (left == 0) { nd->nd_md = nd->nd_md->m_next; if (nd->nd_md == NULL) return (retp); left = nd->nd_md->m_len; nd->nd_dpos = mtod(nd->nd_md, caddr_t); } if (left >= siz) { retp = nd->nd_dpos; nd->nd_dpos += siz; } else if (nd->nd_md->m_next == NULL) { return (retp); } else if (siz > ncl_mbuf_mhlen) { panic("nfs S too big"); } else { MGET(mp2, MT_DATA, how); if (mp2 == NULL) return (NULL); mp2->m_next = nd->nd_md->m_next; nd->nd_md->m_next = mp2; nd->nd_md->m_len -= left; nd->nd_md = mp2; retp = p = mtod(mp2, caddr_t); NFSBCOPY(nd->nd_dpos, p, left); /* Copy what was left */ siz2 = siz - left; p += left; mp2 = mp2->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (NULL); xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; if (xfer > 0) { NFSBCOPY(mtod(mp2, caddr_t), p, xfer); mp2->m_data += xfer; mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } nd->nd_md->m_len = siz; nd->nd_md = mp2; nd->nd_dpos = mtod(mp2, caddr_t); } return (retp); } /* * Advance the position in the mbuf chain. * If offs == 0, this is a no-op, but it is simpler to just return from * here than check for offs > 0 for all calls to nfsm_advance. * If left == -1, it should be calculated here. */ int nfsm_advance(struct nfsrv_descript *nd, int offs, int left) { int error = 0; if (offs == 0) goto out; /* * A negative offs might indicate a corrupted mbuf chain and, * as such, a printf is logged. */ if (offs < 0) { printf("nfsrv_advance: negative offs\n"); error = EBADRPC; goto out; } /* * If left == -1, calculate it here. */ if (left == -1) left = mtod(nd->nd_md, caddr_t) + nd->nd_md->m_len - nd->nd_dpos; /* * Loop around, advancing over the mbuf data. */ while (offs > left) { offs -= left; nd->nd_md = nd->nd_md->m_next; if (nd->nd_md == NULL) { error = EBADRPC; goto out; } left = nd->nd_md->m_len; nd->nd_dpos = mtod(nd->nd_md, caddr_t); } nd->nd_dpos += offs; out: NFSEXITCODE(error); return (error); } /* * Copy a string into mbuf(s). * Return the number of bytes output, including XDR overheads. */ int nfsm_strtom(struct nfsrv_descript *nd, const char *cp, int siz) { struct mbuf *m2; int xfer, left; struct mbuf *m1; int rem, bytesize; u_int32_t *tl; char *cp2; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(siz); rem = NFSM_RNDUP(siz) - siz; bytesize = NFSX_UNSIGNED + siz + rem; m2 = nd->nd_mb; cp2 = nd->nd_bpos; if ((nd->nd_flag & ND_EXTPG) != 0) left = nd->nd_bextpgsiz; else left = M_TRAILINGSPACE(m2); KASSERT(((m2->m_flags & (M_EXT | M_EXTPG)) == (M_EXT | M_EXTPG) && (nd->nd_flag & ND_EXTPG) != 0) || ((m2->m_flags & (M_EXT | M_EXTPG)) != (M_EXT | M_EXTPG) && (nd->nd_flag & ND_EXTPG) == 0), ("nfsm_strtom: ext_pgs and non-ext_pgs mbufs mixed")); /* * Loop around copying the string to mbuf(s). */ while (siz > 0) { if (left == 0) { if ((nd->nd_flag & ND_EXTPG) != 0) { m2 = nfsm_add_ext_pgs(m2, nd->nd_maxextsiz, &nd->nd_bextpg); cp2 = (char *)(void *)PHYS_TO_DMAP( m2->m_epg_pa[nd->nd_bextpg]); nd->nd_bextpgsiz = left = PAGE_SIZE; } else { if (siz > ncl_mbuf_mlen) NFSMCLGET(m1, M_WAITOK); else NFSMGET(m1); m1->m_len = 0; cp2 = mtod(m1, char *); left = M_TRAILINGSPACE(m1); m2->m_next = m1; m2 = m1; } } if (left >= siz) xfer = siz; else xfer = left; NFSBCOPY(cp, cp2, xfer); cp += xfer; cp2 += xfer; m2->m_len += xfer; siz -= xfer; left -= xfer; if ((nd->nd_flag & ND_EXTPG) != 0) { nd->nd_bextpgsiz -= xfer; m2->m_epg_last_len += xfer; } if (siz == 0 && rem) { if (left < rem) panic("nfsm_strtom"); NFSBZERO(cp2, rem); m2->m_len += rem; cp2 += rem; if ((nd->nd_flag & ND_EXTPG) != 0) { nd->nd_bextpgsiz -= rem; m2->m_epg_last_len += rem; } } } nd->nd_mb = m2; if ((nd->nd_flag & ND_EXTPG) != 0) nd->nd_bpos = cp2; else nd->nd_bpos = mtod(m2, char *) + m2->m_len; return (bytesize); } /* * Called once to initialize data structures... */ void newnfs_init(void) { static int nfs_inited = 0; if (nfs_inited) return; nfs_inited = 1; newnfs_true = txdr_unsigned(TRUE); newnfs_false = txdr_unsigned(FALSE); newnfs_xdrneg1 = txdr_unsigned(-1); nfscl_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfscl_ticks < 1) nfscl_ticks = 1; NFSSETBOOTTIME(nfsboottime); /* * Initialize reply list and start timer */ TAILQ_INIT(&nfsd_reqq); NFS_TIMERINIT; } /* * Put a file handle in an mbuf list. * If the size argument == 0, just use the default size. * set_true == 1 if there should be an newnfs_true prepended on the file handle. * Return the number of bytes output, including XDR overhead. */ int nfsm_fhtom(struct nfsrv_descript *nd, u_int8_t *fhp, int size, int set_true) { u_int32_t *tl; u_int8_t *cp; int fullsiz, rem, bytesize = 0; if (size == 0) size = NFSX_MYFH; switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) { case ND_NFSV2: if (size > NFSX_V2FH) panic("fh size > NFSX_V2FH for NFSv2"); NFSM_BUILD(cp, u_int8_t *, NFSX_V2FH); NFSBCOPY(fhp, cp, size); if (size < NFSX_V2FH) NFSBZERO(cp + size, NFSX_V2FH - size); bytesize = NFSX_V2FH; break; case ND_NFSV3: case ND_NFSV4: fullsiz = NFSM_RNDUP(size); rem = fullsiz - size; if (set_true) { bytesize = 2 * NFSX_UNSIGNED + fullsiz; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_true; } else { bytesize = NFSX_UNSIGNED + fullsiz; } (void) nfsm_strtom(nd, fhp, size); break; } return (bytesize); } /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int nfsaddr_match(int family, union nethostaddr *haddr, NFSSOCKADDR_T nam) { #ifdef INET struct sockaddr_in *inetaddr; #endif switch (family) { #ifdef INET case AF_INET: inetaddr = NFSSOCKADDR(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inet.s_addr) return (1); break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *inetaddr6; inetaddr6 = NFSSOCKADDR(nam, struct sockaddr_in6 *); /* XXX - should test sin6_scope_id ? */ if (inetaddr6->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&inetaddr6->sin6_addr, &haddr->had_inet6)) return (1); } break; #endif } return (0); } /* * Similar to the above, but takes to NFSSOCKADDR_T args. */ int nfsaddr2_match(NFSSOCKADDR_T nam1, NFSSOCKADDR_T nam2) { struct sockaddr_in *addr1, *addr2; struct sockaddr *inaddr; inaddr = NFSSOCKADDR(nam1, struct sockaddr *); switch (inaddr->sa_family) { case AF_INET: addr1 = NFSSOCKADDR(nam1, struct sockaddr_in *); addr2 = NFSSOCKADDR(nam2, struct sockaddr_in *); if (addr2->sin_family == AF_INET && addr1->sin_addr.s_addr == addr2->sin_addr.s_addr) return (1); break; #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *inet6addr1, *inet6addr2; inet6addr1 = NFSSOCKADDR(nam1, struct sockaddr_in6 *); inet6addr2 = NFSSOCKADDR(nam2, struct sockaddr_in6 *); /* XXX - should test sin6_scope_id ? */ if (inet6addr2->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&inet6addr1->sin6_addr, &inet6addr2->sin6_addr)) return (1); } break; #endif } return (0); } /* * Dissect a file handle on the client. */ int nfsm_getfh(struct nfsrv_descript *nd, struct nfsfh **nfhpp) { u_int32_t *tl; struct nfsfh *nfhp; int error, len; *nfhpp = NULL; if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) { error = EBADRPC; goto nfsmout; } } else len = NFSX_V2FH; nfhp = malloc(sizeof (struct nfsfh) + len, M_NFSFH, M_WAITOK); error = nfsrv_mtostr(nd, nfhp->nfh_fh, len); if (error) { free(nfhp, M_NFSFH); goto nfsmout; } nfhp->nfh_len = len; *nfhpp = nfhp; nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Break down the nfsv4 acl. * If the aclp == NULL or won't fit in an acl, just discard the acl info. */ int nfsrv_dissectacl(struct nfsrv_descript *nd, NFSACL_T *aclp, int *aclerrp, int *aclsizep, __unused NFSPROC_T *p) { u_int32_t *tl; int i, aclsize; int acecnt, error = 0, aceerr = 0, acesize; *aclerrp = 0; if (aclp) aclp->acl_cnt = 0; /* * Parse out the ace entries and expect them to conform to * what can be supported by R/W/X bits. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); aclsize = NFSX_UNSIGNED; acecnt = fxdr_unsigned(int, *tl); if (acecnt > ACL_MAX_ENTRIES) aceerr = NFSERR_ATTRNOTSUPP; if (nfsrv_useacl == 0) aceerr = NFSERR_ATTRNOTSUPP; for (i = 0; i < acecnt; i++) { if (aclp && !aceerr) error = nfsrv_dissectace(nd, &aclp->acl_entry[i], &aceerr, &acesize, p); else error = nfsrv_skipace(nd, &acesize); if (error) goto nfsmout; aclsize += acesize; } if (aclp && !aceerr) aclp->acl_cnt = acecnt; if (aceerr) *aclerrp = aceerr; if (aclsizep) *aclsizep = aclsize; nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Skip over an NFSv4 ace entry. Just dissect the xdr and discard it. */ static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep) { u_int32_t *tl; int error, len = 0; NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 3)); error = nfsm_advance(nd, NFSM_RNDUP(len), -1); nfsmout: *acesizep = NFSM_RNDUP(len) + (4 * NFSX_UNSIGNED); NFSEXITCODE2(error, nd); return (error); } /* * Get attribute bits from an mbuf list. * Returns EBADRPC for a parsing error, 0 otherwise. * If the clearinvalid flag is set, clear the bits not supported. */ int nfsrv_getattrbits(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp, int *cntp, int *retnotsupp) { u_int32_t *tl; int cnt, i, outcnt; int error = 0; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); cnt = fxdr_unsigned(int, *tl); if (cnt < 0) { error = NFSERR_BADXDR; goto nfsmout; } if (cnt > NFSATTRBIT_MAXWORDS) outcnt = NFSATTRBIT_MAXWORDS; else outcnt = cnt; NFSZERO_ATTRBIT(attrbitp); if (outcnt > 0) { NFSM_DISSECT(tl, u_int32_t *, outcnt * NFSX_UNSIGNED); for (i = 0; i < outcnt; i++) attrbitp->bits[i] = fxdr_unsigned(u_int32_t, *tl++); } for (i = 0; i < (cnt - outcnt); i++) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (retnotsupp != NULL && *tl != 0) *retnotsupp = NFSERR_ATTRNOTSUPP; } if (cntp) *cntp = NFSX_UNSIGNED + (cnt * NFSX_UNSIGNED); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Get the attributes for V4. * If the compare flag is true, test for any attribute changes, * otherwise return the attribute values. * These attributes cover fields in "struct vattr", "struct statfs", * "struct nfsfsinfo", the file handle and the lease duration. * The value of retcmpp is set to 1 if all attributes are the same, * and 0 otherwise. * Returns EBADRPC if it can't be parsed, 0 otherwise. */ int nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nap, struct nfsfh **nfhpp, fhandle_t *fhp, int fhsize, struct nfsv3_pathconf *pc, struct statfs *sbp, struct nfsstatfs *sfp, struct nfsfsinfo *fsp, NFSACL_T *aclp, int compare, int *retcmpp, u_int32_t *leasep, u_int32_t *rderrp, NFSPROC_T *p, struct ucred *cred) { u_int32_t *tl; int i = 0, j, k, l = 0, m, bitpos, attrsum = 0; int error, tfhsize, aceerr, attrsize, cnt, retnotsup; u_char *cp, *cp2, namestr[NFSV4_SMALLSTR + 1]; nfsattrbit_t attrbits, retattrbits, checkattrbits; struct nfsfh *tnfhp; struct nfsreferral *refp; u_quad_t tquad; nfsquad_t tnfsquad; struct timespec temptime; uid_t uid; gid_t gid; u_int32_t freenum = 0, tuint; u_int64_t uquad = 0, thyp, thyp2; #ifdef QUOTA struct dqblk dqb; uid_t savuid; #endif CTASSERT(sizeof(ino_t) == sizeof(uint64_t)); if (compare) { retnotsup = 0; error = nfsrv_getattrbits(nd, &attrbits, NULL, &retnotsup); } else { error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); } if (error) goto nfsmout; if (compare) { *retcmpp = retnotsup; } else { /* * Just set default values to some of the important ones. */ if (nap != NULL) { nap->na_type = VREG; nap->na_mode = 0; nap->na_rdev = (NFSDEV_T)0; nap->na_mtime.tv_sec = 0; nap->na_mtime.tv_nsec = 0; nap->na_btime.tv_sec = -1; nap->na_btime.tv_nsec = 0; nap->na_gen = 0; nap->na_flags = 0; nap->na_blocksize = NFS_FABLKSIZE; } if (sbp != NULL) { sbp->f_bsize = NFS_FABLKSIZE; sbp->f_blocks = 0; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; } if (fsp != NULL) { fsp->fs_rtmax = 8192; fsp->fs_rtpref = 8192; fsp->fs_maxname = NFS_MAXNAMLEN; fsp->fs_wtmax = 8192; fsp->fs_wtpref = 8192; fsp->fs_wtmult = NFS_FABLKSIZE; fsp->fs_dtpref = 8192; fsp->fs_maxfilesize = 0xffffffffffffffffull; fsp->fs_timedelta.tv_sec = 0; fsp->fs_timedelta.tv_nsec = 1; fsp->fs_properties = (NFSV3_FSFLINK | NFSV3_FSFSYMLINK | NFSV3_FSFHOMOGENEOUS | NFSV3_FSFCANSETTIME); } if (pc != NULL) { pc->pc_linkmax = NFS_LINK_MAX; pc->pc_namemax = NAME_MAX; pc->pc_notrunc = 0; pc->pc_chownrestricted = 0; pc->pc_caseinsensitive = 0; pc->pc_casepreserving = 1; } if (sfp != NULL) { sfp->sf_ffiles = UINT64_MAX; sfp->sf_tfiles = UINT64_MAX; sfp->sf_afiles = UINT64_MAX; sfp->sf_fbytes = UINT64_MAX; sfp->sf_tbytes = UINT64_MAX; sfp->sf_abytes = UINT64_MAX; } } /* * Loop around getting the attributes. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsize = fxdr_unsigned(int, *tl); for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) { if (attrsum > attrsize) { error = NFSERR_BADXDR; goto nfsmout; } if (NFSISSET_ATTRBIT(&attrbits, bitpos)) switch (bitpos) { case NFSATTRBIT_SUPPORTEDATTRS: retnotsup = 0; if (compare || nap == NULL) error = nfsrv_getattrbits(nd, &retattrbits, &cnt, &retnotsup); else error = nfsrv_getattrbits(nd, &nap->na_suppattr, &cnt, &retnotsup); if (error) goto nfsmout; if (compare && !(*retcmpp)) { NFSSETSUPP_ATTRBIT(&checkattrbits, nd); /* Some filesystem do not support NFSv4ACL */ if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0) { NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL); NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT); } if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits) || retnotsup) *retcmpp = NFSERR_NOTSAME; } attrsum += cnt; break; case NFSATTRBIT_TYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (nap->na_type != nfsv34tov_type(*tl)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_type = nfsv34tov_type(*tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_FHEXPIRETYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) { if (fxdr_unsigned(int, *tl) != NFSV4FHTYPE_PERSISTENT) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_CHANGE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp)) { if (nap->na_filerev != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_filerev = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_SIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp)) { if (nap->na_size != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_size = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_LINKSUPPORT: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fsp->fs_properties & NFSV3_FSFLINK) { if (*tl == newnfs_false) *retcmpp = NFSERR_NOTSAME; } else { if (*tl == newnfs_true) *retcmpp = NFSERR_NOTSAME; } } } else if (fsp != NULL) { if (*tl == newnfs_true) fsp->fs_properties |= NFSV3_FSFLINK; else fsp->fs_properties &= ~NFSV3_FSFLINK; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_SYMLINKSUPPORT: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fsp->fs_properties & NFSV3_FSFSYMLINK) { if (*tl == newnfs_false) *retcmpp = NFSERR_NOTSAME; } else { if (*tl == newnfs_true) *retcmpp = NFSERR_NOTSAME; } } } else if (fsp != NULL) { if (*tl == newnfs_true) fsp->fs_properties |= NFSV3_FSFSYMLINK; else fsp->fs_properties &= ~NFSV3_FSFSYMLINK; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_NAMEDATTR: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) { if (*tl != newnfs_false) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_FSID: NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); thyp = fxdr_hyper(tl); tl += 2; thyp2 = fxdr_hyper(tl); if (compare) { if (*retcmpp == 0) { if (thyp != (u_int64_t) vp->v_mount->mnt_stat.f_fsid.val[0] || thyp2 != (u_int64_t) vp->v_mount->mnt_stat.f_fsid.val[1]) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_filesid[0] = thyp; nap->na_filesid[1] = thyp2; } attrsum += (4 * NFSX_UNSIGNED); break; case NFSATTRBIT_UNIQUEHANDLES: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) { if (*tl != newnfs_true) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_LEASETIME: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (fxdr_unsigned(int, *tl) != nfsrv_lease && !(*retcmpp)) *retcmpp = NFSERR_NOTSAME; } else if (leasep != NULL) { *leasep = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_RDATTRERROR: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) *retcmpp = NFSERR_INVAL; } else if (rderrp != NULL) { *rderrp = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_ACL: if (compare) { if (!(*retcmpp)) { if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) { NFSACL_T *naclp; naclp = acl_alloc(M_WAITOK); error = nfsrv_dissectacl(nd, naclp, &aceerr, &cnt, p); if (error) { acl_free(naclp); goto nfsmout; } if (aceerr || aclp == NULL || nfsrv_compareacl(aclp, naclp)) *retcmpp = NFSERR_NOTSAME; acl_free(naclp); } else { error = nfsrv_dissectacl(nd, NULL, &aceerr, &cnt, p); *retcmpp = NFSERR_ATTRNOTSUPP; } } } else { if (vp != NULL && aclp != NULL) error = nfsrv_dissectacl(nd, aclp, &aceerr, &cnt, p); else error = nfsrv_dissectacl(nd, NULL, &aceerr, &cnt, p); if (error) goto nfsmout; } attrsum += cnt; break; case NFSATTRBIT_ACLSUPPORT: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) { if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) { if (fxdr_unsigned(u_int32_t, *tl) != NFSV4ACE_SUPTYPES) *retcmpp = NFSERR_NOTSAME; } else { *retcmpp = NFSERR_ATTRNOTSUPP; } } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_ARCHIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_CANSETTIME: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fsp->fs_properties & NFSV3_FSFCANSETTIME) { if (*tl == newnfs_false) *retcmpp = NFSERR_NOTSAME; } else { if (*tl == newnfs_true) *retcmpp = NFSERR_NOTSAME; } } } else if (fsp != NULL) { if (*tl == newnfs_true) fsp->fs_properties |= NFSV3_FSFCANSETTIME; else fsp->fs_properties &= ~NFSV3_FSFCANSETTIME; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_CASEINSENSITIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (*tl != newnfs_false) *retcmpp = NFSERR_NOTSAME; } } else if (pc != NULL) { pc->pc_caseinsensitive = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_CASEPRESERVING: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (*tl != newnfs_true) *retcmpp = NFSERR_NOTSAME; } } else if (pc != NULL) { pc->pc_casepreserving = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_CHOWNRESTRICTED: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (*tl != newnfs_true) *retcmpp = NFSERR_NOTSAME; } } else if (pc != NULL) { pc->pc_chownrestricted = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_FILEHANDLE: error = nfsm_getfh(nd, &tnfhp); if (error) goto nfsmout; tfhsize = tnfhp->nfh_len; if (compare) { if (!(*retcmpp) && !NFSRV_CMPFH(tnfhp->nfh_fh, tfhsize, fhp, fhsize)) *retcmpp = NFSERR_NOTSAME; free(tnfhp, M_NFSFH); } else if (nfhpp != NULL) { *nfhpp = tnfhp; } else { free(tnfhp, M_NFSFH); } attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(tfhsize)); break; case NFSATTRBIT_FILEID: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); thyp = fxdr_hyper(tl); if (compare) { if (!(*retcmpp)) { if (nap->na_fileid != thyp) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) nap->na_fileid = thyp; attrsum += NFSX_HYPER; break; case NFSATTRBIT_FILESAVAIL: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_afiles != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_afiles = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_FILESFREE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_ffiles != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_ffiles = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_FILESTOTAL: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_tfiles != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_tfiles = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_FSLOCATIONS: error = nfsrv_getrefstr(nd, &cp, &cp2, &l, &m); if (error) goto nfsmout; attrsum += l; if (compare && !(*retcmpp)) { refp = nfsv4root_getreferral(vp, NULL, 0); if (refp != NULL) { if (cp == NULL || cp2 == NULL || strcmp(cp, "/") || strcmp(cp2, refp->nfr_srvlist)) *retcmpp = NFSERR_NOTSAME; } else if (m == 0) { *retcmpp = NFSERR_NOTSAME; } } if (cp != NULL) free(cp, M_NFSSTRING); if (cp2 != NULL) free(cp2, M_NFSSTRING); break; case NFSATTRBIT_HIDDEN: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HOMOGENEOUS: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fsp->fs_properties & NFSV3_FSFHOMOGENEOUS) { if (*tl == newnfs_false) *retcmpp = NFSERR_NOTSAME; } else { if (*tl == newnfs_true) *retcmpp = NFSERR_NOTSAME; } } } else if (fsp != NULL) { if (*tl == newnfs_true) fsp->fs_properties |= NFSV3_FSFHOMOGENEOUS; else fsp->fs_properties &= ~NFSV3_FSFHOMOGENEOUS; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXFILESIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); tnfsquad.qval = fxdr_hyper(tl); if (compare) { if (!(*retcmpp)) { tquad = NFSRV_MAXFILESIZE; if (tquad != tnfsquad.qval) *retcmpp = NFSERR_NOTSAME; } } else if (fsp != NULL) { fsp->fs_maxfilesize = tnfsquad.qval; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_MAXLINK: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fxdr_unsigned(int, *tl) != NFS_LINK_MAX) *retcmpp = NFSERR_NOTSAME; } } else if (pc != NULL) { pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXNAME: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (fsp->fs_maxname != fxdr_unsigned(u_int32_t, *tl)) *retcmpp = NFSERR_NOTSAME; } } else { tuint = fxdr_unsigned(u_int32_t, *tl); /* * Some Linux NFSv4 servers report this * as 0 or 4billion, so I'll set it to * NFS_MAXNAMLEN. If a server actually creates * a name longer than NFS_MAXNAMLEN, it will * get an error back. */ if (tuint == 0 || tuint > NFS_MAXNAMLEN) tuint = NFS_MAXNAMLEN; if (fsp != NULL) fsp->fs_maxname = tuint; if (pc != NULL) pc->pc_namemax = tuint; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXREAD: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp)) { if (fsp->fs_rtmax != fxdr_unsigned(u_int32_t, *(tl + 1)) || *tl != 0) *retcmpp = NFSERR_NOTSAME; } } else if (fsp != NULL) { fsp->fs_rtmax = fxdr_unsigned(u_int32_t, *++tl); fsp->fs_rtpref = fsp->fs_rtmax; fsp->fs_dtpref = fsp->fs_rtpref; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_MAXWRITE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp)) { if (fsp->fs_wtmax != fxdr_unsigned(u_int32_t, *(tl + 1)) || *tl != 0) *retcmpp = NFSERR_NOTSAME; } } else if (fsp != NULL) { fsp->fs_wtmax = fxdr_unsigned(int, *++tl); fsp->fs_wtpref = fsp->fs_wtmax; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_MIMETYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i)); error = nfsm_advance(nd, NFSM_RNDUP(i), -1); if (error) goto nfsmout; if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; break; case NFSATTRBIT_MODE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (nap->na_mode != nfstov_mode(*tl)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_mode = nfstov_mode(*tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_NOTRUNC: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { if (*tl != newnfs_true) *retcmpp = NFSERR_NOTSAME; } } else if (pc != NULL) { pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl); } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_NUMLINKS: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); tuint = fxdr_unsigned(u_int32_t, *tl); if (compare) { if (!(*retcmpp)) { if ((u_int32_t)nap->na_nlink != tuint) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_nlink = tuint; } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_OWNER: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (compare) { if (!(*retcmpp)) { if (nfsv4_strtouid(nd, cp, j, &uid) || nap->na_uid != uid) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { if (nfsv4_strtouid(nd, cp, j, &uid)) nap->na_uid = nfsrv_defaultuid; else nap->na_uid = uid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); break; case NFSATTRBIT_OWNERGROUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (compare) { if (!(*retcmpp)) { if (nfsv4_strtogid(nd, cp, j, &gid) || nap->na_gid != gid) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { if (nfsv4_strtogid(nd, cp, j, &gid)) nap->na_gid = nfsrv_defaultgid; else nap->na_gid = gid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); break; case NFSATTRBIT_QUOTAHARD: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (sbp != NULL) { if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) freenum = sbp->f_bfree; else freenum = sbp->f_bavail; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(vp->v_mount,QCMD(Q_GETQUOTA, USRQUOTA), cred->cr_uid, &dqb)) freenum = min(dqb.dqb_bhardlimit, freenum); p->p_cred->p_ruid = savuid; #endif /* QUOTA */ uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize); } if (compare && !(*retcmpp)) { if (uquad != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_QUOTASOFT: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (sbp != NULL) { if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) freenum = sbp->f_bfree; else freenum = sbp->f_bavail; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(vp->v_mount,QCMD(Q_GETQUOTA, USRQUOTA), cred->cr_uid, &dqb)) freenum = min(dqb.dqb_bsoftlimit, freenum); p->p_cred->p_ruid = savuid; #endif /* QUOTA */ uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize); } if (compare && !(*retcmpp)) { if (uquad != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_QUOTAUSED: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (sbp != NULL) { freenum = 0; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(vp->v_mount,QCMD(Q_GETQUOTA, USRQUOTA), cred->cr_uid, &dqb)) freenum = dqb.dqb_curblocks; p->p_cred->p_ruid = savuid; #endif /* QUOTA */ uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize); } if (compare && !(*retcmpp)) { if (uquad != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_RAWDEV: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4SPECDATA); j = fxdr_unsigned(int, *tl++); k = fxdr_unsigned(int, *tl); if (compare) { if (!(*retcmpp)) { if (nap->na_rdev != NFSMAKEDEV(j, k)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_rdev = NFSMAKEDEV(j, k); } attrsum += NFSX_V4SPECDATA; break; case NFSATTRBIT_SPACEAVAIL: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_abytes != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_abytes = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_SPACEFREE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_fbytes != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_fbytes = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_SPACETOTAL: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (compare) { if (!(*retcmpp) && sfp->sf_tbytes != fxdr_hyper(tl)) *retcmpp = NFSERR_NOTSAME; } else if (sfp != NULL) { sfp->sf_tbytes = fxdr_hyper(tl); } attrsum += NFSX_HYPER; break; case NFSATTRBIT_SPACEUSED: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); thyp = fxdr_hyper(tl); if (compare) { if (!(*retcmpp)) { if ((u_int64_t)nap->na_bytes != thyp) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_bytes = thyp; } attrsum += NFSX_HYPER; break; case NFSATTRBIT_SYSTEM: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESS: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &temptime); if (compare) { if (!(*retcmpp)) { if (!NFS_CMPTIME(temptime, nap->na_atime)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_atime = temptime; } attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEACCESSSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; i = fxdr_unsigned(int, *tl); if (i == NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); attrsum += NFSX_V4TIME; } if (compare && !(*retcmpp)) *retcmpp = NFSERR_INVAL; break; case NFSATTRBIT_TIMEBACKUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMECREATE: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &temptime); if (compare) { if (!(*retcmpp)) { if (!NFS_CMPTIME(temptime, nap->na_btime)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_btime = temptime; } attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEDELTA: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); if (fsp != NULL) { if (compare) { if (!(*retcmpp)) { if ((u_int32_t)fsp->fs_timedelta.tv_sec != fxdr_unsigned(u_int32_t, *(tl + 1)) || (u_int32_t)fsp->fs_timedelta.tv_nsec != (fxdr_unsigned(u_int32_t, *(tl + 2)) % 1000000000) || *tl != 0) *retcmpp = NFSERR_NOTSAME; } } else { fxdr_nfsv4time(tl, &fsp->fs_timedelta); } } attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMETADATA: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &temptime); if (compare) { if (!(*retcmpp)) { if (!NFS_CMPTIME(temptime, nap->na_ctime)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_ctime = temptime; } attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFY: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &temptime); if (compare) { if (!(*retcmpp)) { if (!NFS_CMPTIME(temptime, nap->na_mtime)) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { nap->na_mtime = temptime; } attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFYSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; i = fxdr_unsigned(int, *tl); if (i == NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); attrsum += NFSX_V4TIME; } if (compare && !(*retcmpp)) *retcmpp = NFSERR_INVAL; break; case NFSATTRBIT_MOUNTEDONFILEID: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); thyp = fxdr_hyper(tl); if (compare) { if (!(*retcmpp)) { if (!vp || !nfsrv_atroot(vp, &thyp2)) thyp2 = nap->na_fileid; if (thyp2 != thyp) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) nap->na_mntonfileno = thyp; attrsum += NFSX_HYPER; break; case NFSATTRBIT_SUPPATTREXCLCREAT: retnotsup = 0; error = nfsrv_getattrbits(nd, &retattrbits, &cnt, &retnotsup); if (error) goto nfsmout; if (compare && !(*retcmpp)) { NFSSETSUPP_ATTRBIT(&checkattrbits, nd); NFSCLRNOTSETABLE_ATTRBIT(&checkattrbits, nd); NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_TIMEACCESSSET); if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits) || retnotsup) *retcmpp = NFSERR_NOTSAME; } attrsum += cnt; break; case NFSATTRBIT_FSLAYOUTTYPE: case NFSATTRBIT_LAYOUTTYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; i = fxdr_unsigned(int, *tl); if (i > 0) { NFSM_DISSECT(tl, u_int32_t *, i * NFSX_UNSIGNED); attrsum += i * NFSX_UNSIGNED; j = fxdr_unsigned(int, *tl); if (i == 1 && compare && !(*retcmpp) && (((nfsrv_doflexfile != 0 || nfsrv_maxpnfsmirror > 1) && j != NFSLAYOUT_FLEXFILE) || (nfsrv_doflexfile == 0 && j != NFSLAYOUT_NFSV4_1_FILES))) *retcmpp = NFSERR_NOTSAME; } if (nfsrv_devidcnt == 0) { if (compare && !(*retcmpp) && i > 0) *retcmpp = NFSERR_NOTSAME; } else { if (compare && !(*retcmpp) && i != 1) *retcmpp = NFSERR_NOTSAME; } break; case NFSATTRBIT_LAYOUTALIGNMENT: case NFSATTRBIT_LAYOUTBLKSIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; i = fxdr_unsigned(int, *tl); if (compare && !(*retcmpp) && i != nfs_srvmaxio) *retcmpp = NFSERR_NOTSAME; break; default: printf("EEK! nfsv4_loadattr unknown attr=%d\n", bitpos); if (compare && !(*retcmpp)) *retcmpp = NFSERR_ATTRNOTSUPP; /* * and get out of the loop, since we can't parse * the unknown attrbute data. */ bitpos = NFSATTRBIT_MAX; break; } } /* * some clients pad the attrlist, so we need to skip over the * padding. */ if (attrsum > attrsize) { error = NFSERR_BADXDR; } else { attrsize = NFSM_RNDUP(attrsize); if (attrsum < attrsize) error = nfsm_advance(nd, attrsize - attrsum, -1); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Implement sleep locks for newnfs. The nfslock_usecnt allows for a * shared lock and the NFSXXX_LOCK flag permits an exclusive lock. * The first argument is a pointer to an nfsv4lock structure. * The second argument is 1 iff a blocking lock is wanted. * If this argument is 0, the call waits until no thread either wants nor * holds an exclusive lock. * It returns 1 if the lock was acquired, 0 otherwise. * If several processes call this function concurrently wanting the exclusive * lock, one will get the lock and the rest will return without getting the * lock. (If the caller must have the lock, it simply calls this function in a * loop until the function returns 1 to indicate the lock was acquired.) * Any usecnt must be decremented by calling nfsv4_relref() before * calling nfsv4_lock(). It was done this way, so nfsv4_lock() could * be called in a loop. * The isleptp argument is set to indicate if the call slept, iff not NULL * and the mp argument indicates to check for a forced dismount, iff not * NULL. */ int nfsv4_lock(struct nfsv4lock *lp, int iwantlock, int *isleptp, struct mtx *mutex, struct mount *mp) { if (isleptp) *isleptp = 0; /* * If a lock is wanted, loop around until the lock is acquired by * someone and then released. If I want the lock, try to acquire it. * For a lock to be issued, no lock must be in force and the usecnt * must be zero. */ if (iwantlock) { if (!(lp->nfslock_lock & NFSV4LOCK_LOCK) && lp->nfslock_usecnt == 0) { lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED; lp->nfslock_lock |= NFSV4LOCK_LOCK; return (1); } lp->nfslock_lock |= NFSV4LOCK_LOCKWANTED; } while (lp->nfslock_lock & (NFSV4LOCK_LOCK | NFSV4LOCK_LOCKWANTED)) { if (mp != NULL && NFSCL_FORCEDISM(mp)) { lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED; return (0); } lp->nfslock_lock |= NFSV4LOCK_WANTED; if (isleptp) *isleptp = 1; msleep(&lp->nfslock_lock, mutex, PVFS, "nfsv4lck", hz); if (iwantlock && !(lp->nfslock_lock & NFSV4LOCK_LOCK) && lp->nfslock_usecnt == 0) { lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED; lp->nfslock_lock |= NFSV4LOCK_LOCK; return (1); } } return (0); } /* * Release the lock acquired by nfsv4_lock(). * The second argument is set to 1 to indicate the nfslock_usecnt should be * incremented, as well. */ void nfsv4_unlock(struct nfsv4lock *lp, int incref) { lp->nfslock_lock &= ~NFSV4LOCK_LOCK; if (incref) lp->nfslock_usecnt++; nfsv4_wanted(lp); } /* * Release a reference cnt. */ void nfsv4_relref(struct nfsv4lock *lp) { if (lp->nfslock_usecnt <= 0) panic("nfsv4root ref cnt"); lp->nfslock_usecnt--; if (lp->nfslock_usecnt == 0) nfsv4_wanted(lp); } /* * Get a reference cnt. * This function will wait for any exclusive lock to be released, but will * not wait for threads that want the exclusive lock. If priority needs * to be given to threads that need the exclusive lock, a call to nfsv4_lock() * with the 2nd argument == 0 should be done before calling nfsv4_getref(). * If the mp argument is not NULL, check for NFSCL_FORCEDISM() being set and * return without getting a refcnt for that case. */ void nfsv4_getref(struct nfsv4lock *lp, int *isleptp, struct mtx *mutex, struct mount *mp) { if (isleptp) *isleptp = 0; /* * Wait for a lock held. */ while (lp->nfslock_lock & NFSV4LOCK_LOCK) { if (mp != NULL && NFSCL_FORCEDISM(mp)) return; lp->nfslock_lock |= NFSV4LOCK_WANTED; if (isleptp) *isleptp = 1; msleep(&lp->nfslock_lock, mutex, PVFS, "nfsv4gr", hz); } if (mp != NULL && NFSCL_FORCEDISM(mp)) return; lp->nfslock_usecnt++; } /* * Get a reference as above, but return failure instead of sleeping if * an exclusive lock is held. */ int nfsv4_getref_nonblock(struct nfsv4lock *lp) { if ((lp->nfslock_lock & NFSV4LOCK_LOCK) != 0) return (0); lp->nfslock_usecnt++; return (1); } /* * Test for a lock. Return 1 if locked, 0 otherwise. */ int nfsv4_testlock(struct nfsv4lock *lp) { if ((lp->nfslock_lock & NFSV4LOCK_LOCK) == 0 && lp->nfslock_usecnt == 0) return (0); return (1); } /* * Wake up anyone sleeping, waiting for this lock. */ static void nfsv4_wanted(struct nfsv4lock *lp) { if (lp->nfslock_lock & NFSV4LOCK_WANTED) { lp->nfslock_lock &= ~NFSV4LOCK_WANTED; wakeup((caddr_t)&lp->nfslock_lock); } } /* * Copy a string from an mbuf list into a character array. * Return EBADRPC if there is an mbuf error, * 0 otherwise. */ int nfsrv_mtostr(struct nfsrv_descript *nd, char *str, int siz) { char *cp; int xfer, len; struct mbuf *mp; int rem, error = 0; mp = nd->nd_md; cp = nd->nd_dpos; len = mtod(mp, caddr_t) + mp->m_len - cp; rem = NFSM_RNDUP(siz) - siz; while (siz > 0) { if (len > siz) xfer = siz; else xfer = len; NFSBCOPY(cp, str, xfer); str += xfer; siz -= xfer; if (siz > 0) { mp = mp->m_next; if (mp == NULL) { error = EBADRPC; goto out; } cp = mtod(mp, caddr_t); len = mp->m_len; } else { cp += xfer; len -= xfer; } } *str = '\0'; nd->nd_dpos = cp; nd->nd_md = mp; if (rem > 0) { if (len < rem) error = nfsm_advance(nd, rem, len); else nd->nd_dpos += rem; } out: NFSEXITCODE2(error, nd); return (error); } /* * Fill in the attributes as marked by the bitmap (V4). */ int nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, struct statfs *pnfssf) { int bitpos, retnum = 0; u_int32_t *tl; int siz, prefixnum, error; u_char *cp, namestr[NFSV4_SMALLSTR]; nfsattrbit_t attrbits, retbits; nfsattrbit_t *retbitp = &retbits; u_int32_t freenum, *retnump; u_int64_t uquad; struct statfs *fs; struct nfsfsinfo fsinf; struct timespec temptime; NFSACL_T *aclp, *naclp = NULL; size_t atsiz; bool xattrsupp; #ifdef QUOTA struct dqblk dqb; uid_t savuid; #endif /* * First, set the bits that can be filled and get fsinfo. */ NFSSET_ATTRBIT(retbitp, attrbitp); /* * If both p and cred are NULL, it is a client side setattr call. * If both p and cred are not NULL, it is a server side reply call. * If p is not NULL and cred is NULL, it is a client side callback * reply call. */ if (p == NULL && cred == NULL) { NFSCLRNOTSETABLE_ATTRBIT(retbitp, nd); aclp = saclp; } else { NFSCLRNOTFILLABLE_ATTRBIT(retbitp, nd); naclp = acl_alloc(M_WAITOK); aclp = naclp; } nfsvno_getfs(&fsinf, isdgram); #ifndef APPLE /* * Get the VFS_STATFS(), since some attributes need them. */ fs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); if (NFSISSETSTATFS_ATTRBIT(retbitp)) { error = VFS_STATFS(mp, fs); if (error != 0) { if (reterr) { nd->nd_repstat = NFSERR_ACCES; free(fs, M_STATFS); return (0); } NFSCLRSTATFS_ATTRBIT(retbitp); } } #endif /* * And the NFSv4 ACL... */ if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT) && (nfsrv_useacl == 0 || ((cred != NULL || p != NULL) && supports_nfsv4acls == 0))) { NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT); } if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACL)) { if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL) && supports_nfsv4acls == 0)) { NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL); } else if (naclp != NULL) { if (NFSVOPLOCK(vp, LK_SHARED) == 0) { error = VOP_ACCESSX(vp, VREAD_ACL, cred, p); if (error == 0) error = VOP_GETACL(vp, ACL_TYPE_NFS4, naclp, cred, p); NFSVOPUNLOCK(vp); } else error = NFSERR_PERM; if (error != 0) { if (reterr) { nd->nd_repstat = NFSERR_ACCES; free(fs, M_STATFS); return (0); } NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL); } } } /* Check to see if Extended Attributes are supported. */ xattrsupp = false; if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) { if (NFSVOPLOCK(vp, LK_SHARED) == 0) { error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, "xxx", NULL, &atsiz, cred, p); NFSVOPUNLOCK(vp); if (error != EOPNOTSUPP) xattrsupp = true; } } /* * Put out the attribute bitmap for the ones being filled in * and get the field for the number of attributes returned. */ prefixnum = nfsrv_putattrbit(nd, retbitp); NFSM_BUILD(retnump, u_int32_t *, NFSX_UNSIGNED); prefixnum += NFSX_UNSIGNED; /* * Now, loop around filling in the attributes for each bit set. */ for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) { if (NFSISSET_ATTRBIT(retbitp, bitpos)) { switch (bitpos) { case NFSATTRBIT_SUPPORTEDATTRS: NFSSETSUPP_ATTRBIT(&attrbits, nd); if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL) && supports_nfsv4acls == 0)) { NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT); NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL); } retnum += nfsrv_putattrbit(nd, &attrbits); break; case NFSATTRBIT_TYPE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_type(vap->va_type); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FHEXPIRETYPE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4FHTYPE_PERSISTENT); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_CHANGE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); txdr_hyper(vap->va_filerev, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_SIZE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); txdr_hyper(vap->va_size, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_LINKSUPPORT: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_LINK) *tl = newnfs_true; else *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_SYMLINKSUPPORT: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_SYMLINK) *tl = newnfs_true; else *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_NAMEDATTR: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FSID: NFSM_BUILD(tl, u_int32_t *, NFSX_V4FSID); *tl++ = 0; *tl++ = txdr_unsigned(mp->mnt_stat.f_fsid.val[0]); *tl++ = 0; *tl = txdr_unsigned(mp->mnt_stat.f_fsid.val[1]); retnum += NFSX_V4FSID; break; case NFSATTRBIT_UNIQUEHANDLES: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_true; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_LEASETIME: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfsrv_lease); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_RDATTRERROR: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(rderror); retnum += NFSX_UNSIGNED; break; /* * Recommended Attributes. (Only the supported ones.) */ case NFSATTRBIT_ACL: retnum += nfsrv_buildacl(nd, aclp, vnode_vtype(vp), p); break; case NFSATTRBIT_ACLSUPPORT: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4ACE_SUPTYPES); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_CANSETTIME: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_CANSETTIME) *tl = newnfs_true; else *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_CASEINSENSITIVE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_CASEPRESERVING: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_true; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_CHOWNRESTRICTED: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_true; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FILEHANDLE: retnum += nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0); break; case NFSATTRBIT_FILEID: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); uquad = vap->va_fileid; txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_FILESAVAIL: /* * Check quota and use min(quota, f_ffree). */ freenum = fs->f_ffree; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA), cred->cr_uid, &dqb)) freenum = min(dqb.dqb_isoftlimit-dqb.dqb_curinodes, freenum); p->p_cred->p_ruid = savuid; #endif /* QUOTA */ NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); *tl++ = 0; *tl = txdr_unsigned(freenum); retnum += NFSX_HYPER; break; case NFSATTRBIT_FILESFREE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); *tl++ = 0; *tl = txdr_unsigned(fs->f_ffree); retnum += NFSX_HYPER; break; case NFSATTRBIT_FILESTOTAL: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); *tl++ = 0; *tl = txdr_unsigned(fs->f_files); retnum += NFSX_HYPER; break; case NFSATTRBIT_FSLOCATIONS: NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = 0; *tl = 0; retnum += 2 * NFSX_UNSIGNED; break; case NFSATTRBIT_HOMOGENEOUS: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS) *tl = newnfs_true; else *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXFILESIZE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); uquad = NFSRV_MAXFILESIZE; txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_MAXLINK: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFS_LINK_MAX); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXNAME: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFS_MAXNAMLEN); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_MAXREAD: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); *tl++ = 0; *tl = txdr_unsigned(fsinf.fs_rtmax); retnum += NFSX_HYPER; break; case NFSATTRBIT_MAXWRITE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); *tl++ = 0; *tl = txdr_unsigned(fsinf.fs_wtmax); retnum += NFSX_HYPER; break; case NFSATTRBIT_MODE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_mode(vap->va_mode); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_NOTRUNC: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_true; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_NUMLINKS: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(vap->va_nlink); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_OWNER: cp = namestr; nfsv4_uidtostr(vap->va_uid, &cp, &siz); retnum += nfsm_strtom(nd, cp, siz); if (cp != namestr) free(cp, M_NFSSTRING); break; case NFSATTRBIT_OWNERGROUP: cp = namestr; nfsv4_gidtostr(vap->va_gid, &cp, &siz); retnum += nfsm_strtom(nd, cp, siz); if (cp != namestr) free(cp, M_NFSSTRING); break; case NFSATTRBIT_QUOTAHARD: if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) freenum = fs->f_bfree; else freenum = fs->f_bavail; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA), cred->cr_uid, &dqb)) freenum = min(dqb.dqb_bhardlimit, freenum); p->p_cred->p_ruid = savuid; #endif /* QUOTA */ NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, fs->f_bsize); txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_QUOTASOFT: if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) freenum = fs->f_bfree; else freenum = fs->f_bavail; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA), cred->cr_uid, &dqb)) freenum = min(dqb.dqb_bsoftlimit, freenum); p->p_cred->p_ruid = savuid; #endif /* QUOTA */ NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, fs->f_bsize); txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_QUOTAUSED: freenum = 0; #ifdef QUOTA /* * ufs_quotactl() insists that the uid argument * equal p_ruid for non-root quota access, so * we'll just make sure that's the case. */ savuid = p->p_cred->p_ruid; p->p_cred->p_ruid = cred->cr_uid; if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA), cred->cr_uid, &dqb)) freenum = dqb.dqb_curblocks; p->p_cred->p_ruid = savuid; #endif /* QUOTA */ NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); uquad = (u_int64_t)freenum; NFSQUOTABLKTOBYTE(uquad, fs->f_bsize); txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_RAWDEV: NFSM_BUILD(tl, u_int32_t *, NFSX_V4SPECDATA); *tl++ = txdr_unsigned(NFSMAJOR(vap->va_rdev)); *tl = txdr_unsigned(NFSMINOR(vap->va_rdev)); retnum += NFSX_V4SPECDATA; break; case NFSATTRBIT_SPACEAVAIL: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) { if (pnfssf != NULL) uquad = (u_int64_t)pnfssf->f_bfree; else uquad = (u_int64_t)fs->f_bfree; } else { if (pnfssf != NULL) uquad = (u_int64_t)pnfssf->f_bavail; else uquad = (u_int64_t)fs->f_bavail; } if (pnfssf != NULL) uquad *= pnfssf->f_bsize; else uquad *= fs->f_bsize; txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_SPACEFREE: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); if (pnfssf != NULL) { uquad = (u_int64_t)pnfssf->f_bfree; uquad *= pnfssf->f_bsize; } else { uquad = (u_int64_t)fs->f_bfree; uquad *= fs->f_bsize; } txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_SPACETOTAL: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); if (pnfssf != NULL) { uquad = (u_int64_t)pnfssf->f_blocks; uquad *= pnfssf->f_bsize; } else { uquad = (u_int64_t)fs->f_blocks; uquad *= fs->f_bsize; } txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_SPACEUSED: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); txdr_hyper(vap->va_bytes, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_TIMEACCESS: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_atime, tl); retnum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEACCESSSET: if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME); *tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT); txdr_nfsv4time(&vap->va_atime, tl); retnum += NFSX_V4SETTIME; } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER); retnum += NFSX_UNSIGNED; } break; case NFSATTRBIT_TIMEDELTA: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); temptime.tv_sec = 0; temptime.tv_nsec = 1000000000 / hz; txdr_nfsv4time(&temptime, tl); retnum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMETADATA: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_ctime, tl); retnum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFY: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_mtime, tl); retnum += NFSX_V4TIME; break; case NFSATTRBIT_TIMECREATE: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_birthtime, tl); retnum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFYSET: if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME); *tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT); txdr_nfsv4time(&vap->va_mtime, tl); retnum += NFSX_V4SETTIME; } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER); retnum += NFSX_UNSIGNED; } break; case NFSATTRBIT_MOUNTEDONFILEID: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); if (at_root != 0) uquad = mounted_on_fileno; else uquad = vap->va_fileid; txdr_hyper(uquad, tl); retnum += NFSX_HYPER; break; case NFSATTRBIT_SUPPATTREXCLCREAT: NFSSETSUPP_ATTRBIT(&attrbits, nd); NFSCLRNOTSETABLE_ATTRBIT(&attrbits, nd); NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET); retnum += nfsrv_putattrbit(nd, &attrbits); break; case NFSATTRBIT_FSLAYOUTTYPE: case NFSATTRBIT_LAYOUTTYPE: if (nfsrv_devidcnt == 0) siz = 1; else siz = 2; if (siz == 2) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* One entry. */ if (nfsrv_doflexfile != 0 || nfsrv_maxpnfsmirror > 1) *tl = txdr_unsigned(NFSLAYOUT_FLEXFILE); else *tl = txdr_unsigned( NFSLAYOUT_NFSV4_1_FILES); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = 0; } retnum += siz * NFSX_UNSIGNED; break; case NFSATTRBIT_LAYOUTALIGNMENT: case NFSATTRBIT_LAYOUTBLKSIZE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfs_srvmaxio); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_XATTRSUPPORT: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (xattrsupp) *tl = newnfs_true; else *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; default: printf("EEK! Bad V4 attribute bitpos=%d\n", bitpos); } } } if (naclp != NULL) acl_free(naclp); free(fs, M_STATFS); *retnump = txdr_unsigned(retnum); return (retnum + prefixnum); } /* * Put the attribute bits onto an mbuf list. * Return the number of bytes of output generated. */ int nfsrv_putattrbit(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp) { u_int32_t *tl; int cnt, i, bytesize; for (cnt = NFSATTRBIT_MAXWORDS; cnt > 0; cnt--) if (attrbitp->bits[cnt - 1]) break; bytesize = (cnt + 1) * NFSX_UNSIGNED; NFSM_BUILD(tl, u_int32_t *, bytesize); *tl++ = txdr_unsigned(cnt); for (i = 0; i < cnt; i++) *tl++ = txdr_unsigned(attrbitp->bits[i]); return (bytesize); } /* * Convert a uid to a string. * If the lookup fails, just output the digits. * uid - the user id * cpp - points to a buffer of size NFSV4_SMALLSTR * (malloc a larger one, as required) * retlenp - pointer to length to be returned */ void nfsv4_uidtostr(uid_t uid, u_char **cpp, int *retlenp) { int i; struct nfsusrgrp *usrp; u_char *cp = *cpp; uid_t tmp; int cnt, hasampersand, len = NFSV4_SMALLSTR, ret; struct nfsrv_lughash *hp; cnt = 0; tryagain: if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) { /* * Always map nfsrv_defaultuid to "nobody". */ if (uid == nfsrv_defaultuid) { i = nfsrv_dnsnamelen + 7; if (i > len) { if (len > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); cp = malloc(i, M_NFSSTRING, M_WAITOK); *cpp = cp; len = i; goto tryagain; } *retlenp = i; NFSBCOPY("nobody@", cp, 7); cp += 7; NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen); return; } hasampersand = 0; hp = NFSUSERHASH(uid); mtx_lock(&hp->mtx); TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) { if (usrp->lug_uid == uid) { if (usrp->lug_expiry < NFSD_MONOSEC) break; /* * If the name doesn't already have an '@' * in it, append @domainname to it. */ for (i = 0; i < usrp->lug_namelen; i++) { if (usrp->lug_name[i] == '@') { hasampersand = 1; break; } } if (hasampersand) i = usrp->lug_namelen; else i = usrp->lug_namelen + nfsrv_dnsnamelen + 1; if (i > len) { mtx_unlock(&hp->mtx); if (len > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); cp = malloc(i, M_NFSSTRING, M_WAITOK); *cpp = cp; len = i; goto tryagain; } *retlenp = i; NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen); if (!hasampersand) { cp += usrp->lug_namelen; *cp++ = '@'; NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen); } TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); TAILQ_INSERT_TAIL(&hp->lughead, usrp, lug_numhash); mtx_unlock(&hp->mtx); return; } } mtx_unlock(&hp->mtx); cnt++; ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0, NULL); if (ret == 0 && cnt < 2) goto tryagain; } /* * No match, just return a string of digits. */ tmp = uid; i = 0; while (tmp || i == 0) { tmp /= 10; i++; } len = (i > len) ? len : i; *retlenp = len; cp += (len - 1); tmp = uid; for (i = 0; i < len; i++) { *cp-- = '0' + (tmp % 10); tmp /= 10; } return; } /* * Get a credential for the uid with the server's group list. * If none is found, just return the credential passed in after * logging a warning message. */ struct ucred * nfsrv_getgrpscred(struct ucred *oldcred) { struct nfsusrgrp *usrp; struct ucred *newcred; int cnt, ret; uid_t uid; struct nfsrv_lughash *hp; cnt = 0; uid = oldcred->cr_uid; tryagain: if (nfsrv_dnsnamelen > 0) { hp = NFSUSERHASH(uid); mtx_lock(&hp->mtx); TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) { if (usrp->lug_uid == uid) { if (usrp->lug_expiry < NFSD_MONOSEC) break; if (usrp->lug_cred != NULL) { newcred = crhold(usrp->lug_cred); crfree(oldcred); } else newcred = oldcred; TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); TAILQ_INSERT_TAIL(&hp->lughead, usrp, lug_numhash); mtx_unlock(&hp->mtx); return (newcred); } } mtx_unlock(&hp->mtx); cnt++; ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0, NULL); if (ret == 0 && cnt < 2) goto tryagain; } return (oldcred); } /* * Convert a string to a uid. * If no conversion is possible return NFSERR_BADOWNER, otherwise * return 0. * If this is called from a client side mount using AUTH_SYS and the * string is made up entirely of digits, just convert the string to * a number. */ int nfsv4_strtouid(struct nfsrv_descript *nd, u_char *str, int len, uid_t *uidp) { int i; char *cp, *endstr, *str0; struct nfsusrgrp *usrp; int cnt, ret; int error = 0; uid_t tuid; struct nfsrv_lughash *hp, *hp2; if (len == 0) { error = NFSERR_BADOWNER; goto out; } /* If a string of digits and an AUTH_SYS mount, just convert it. */ str0 = str; tuid = (uid_t)strtoul(str0, &endstr, 10); if ((endstr - str0) == len) { /* A numeric string. */ if ((nd->nd_flag & ND_KERBV) == 0 && ((nd->nd_flag & ND_NFSCL) != 0 || nfsd_enable_stringtouid != 0)) *uidp = tuid; else error = NFSERR_BADOWNER; goto out; } /* * Look for an '@'. */ cp = strchr(str0, '@'); if (cp != NULL) i = (int)(cp++ - str0); else i = len; cnt = 0; tryagain: if (nfsrv_dnsnamelen > 0) { /* * If an '@' is found and the domain name matches, search for * the name with dns stripped off. * Mixed case alpahbetics will match for the domain name, but * all upper case will not. */ if (cnt == 0 && i < len && i > 0 && (len - 1 - i) == nfsrv_dnsnamelen && !nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) { len -= (nfsrv_dnsnamelen + 1); *(cp - 1) = '\0'; } /* * Check for the special case of "nobody". */ if (len == 6 && !NFSBCMP(str, "nobody", 6)) { *uidp = nfsrv_defaultuid; error = 0; goto out; } hp = NFSUSERNAMEHASH(str, len); mtx_lock(&hp->mtx); TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) { if (usrp->lug_namelen == len && !NFSBCMP(usrp->lug_name, str, len)) { if (usrp->lug_expiry < NFSD_MONOSEC) break; hp2 = NFSUSERHASH(usrp->lug_uid); mtx_lock(&hp2->mtx); TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash); TAILQ_INSERT_TAIL(&hp2->lughead, usrp, lug_numhash); *uidp = usrp->lug_uid; mtx_unlock(&hp2->mtx); mtx_unlock(&hp->mtx); error = 0; goto out; } } mtx_unlock(&hp->mtx); cnt++; ret = nfsrv_getuser(RPCNFSUSERD_GETUSER, (uid_t)0, (gid_t)0, str); if (ret == 0 && cnt < 2) goto tryagain; } error = NFSERR_BADOWNER; out: NFSEXITCODE(error); return (error); } /* * Convert a gid to a string. * gid - the group id * cpp - points to a buffer of size NFSV4_SMALLSTR * (malloc a larger one, as required) * retlenp - pointer to length to be returned */ void nfsv4_gidtostr(gid_t gid, u_char **cpp, int *retlenp) { int i; struct nfsusrgrp *usrp; u_char *cp = *cpp; gid_t tmp; int cnt, hasampersand, len = NFSV4_SMALLSTR, ret; struct nfsrv_lughash *hp; cnt = 0; tryagain: if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) { /* * Always map nfsrv_defaultgid to "nogroup". */ if (gid == nfsrv_defaultgid) { i = nfsrv_dnsnamelen + 8; if (i > len) { if (len > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); cp = malloc(i, M_NFSSTRING, M_WAITOK); *cpp = cp; len = i; goto tryagain; } *retlenp = i; NFSBCOPY("nogroup@", cp, 8); cp += 8; NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen); return; } hasampersand = 0; hp = NFSGROUPHASH(gid); mtx_lock(&hp->mtx); TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) { if (usrp->lug_gid == gid) { if (usrp->lug_expiry < NFSD_MONOSEC) break; /* * If the name doesn't already have an '@' * in it, append @domainname to it. */ for (i = 0; i < usrp->lug_namelen; i++) { if (usrp->lug_name[i] == '@') { hasampersand = 1; break; } } if (hasampersand) i = usrp->lug_namelen; else i = usrp->lug_namelen + nfsrv_dnsnamelen + 1; if (i > len) { mtx_unlock(&hp->mtx); if (len > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); cp = malloc(i, M_NFSSTRING, M_WAITOK); *cpp = cp; len = i; goto tryagain; } *retlenp = i; NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen); if (!hasampersand) { cp += usrp->lug_namelen; *cp++ = '@'; NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen); } TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); TAILQ_INSERT_TAIL(&hp->lughead, usrp, lug_numhash); mtx_unlock(&hp->mtx); return; } } mtx_unlock(&hp->mtx); cnt++; ret = nfsrv_getuser(RPCNFSUSERD_GETGID, (uid_t)0, gid, NULL); if (ret == 0 && cnt < 2) goto tryagain; } /* * No match, just return a string of digits. */ tmp = gid; i = 0; while (tmp || i == 0) { tmp /= 10; i++; } len = (i > len) ? len : i; *retlenp = len; cp += (len - 1); tmp = gid; for (i = 0; i < len; i++) { *cp-- = '0' + (tmp % 10); tmp /= 10; } return; } /* * Convert a string to a gid. * If no conversion is possible return NFSERR_BADOWNER, otherwise * return 0. * If this is called from a client side mount using AUTH_SYS and the * string is made up entirely of digits, just convert the string to * a number. */ int nfsv4_strtogid(struct nfsrv_descript *nd, u_char *str, int len, gid_t *gidp) { int i; char *cp, *endstr, *str0; struct nfsusrgrp *usrp; int cnt, ret; int error = 0; gid_t tgid; struct nfsrv_lughash *hp, *hp2; if (len == 0) { error = NFSERR_BADOWNER; goto out; } /* If a string of digits and an AUTH_SYS mount, just convert it. */ str0 = str; tgid = (gid_t)strtoul(str0, &endstr, 10); if ((endstr - str0) == len) { /* A numeric string. */ if ((nd->nd_flag & ND_KERBV) == 0 && ((nd->nd_flag & ND_NFSCL) != 0 || nfsd_enable_stringtouid != 0)) *gidp = tgid; else error = NFSERR_BADOWNER; goto out; } /* * Look for an '@'. */ cp = strchr(str0, '@'); if (cp != NULL) i = (int)(cp++ - str0); else i = len; cnt = 0; tryagain: if (nfsrv_dnsnamelen > 0) { /* * If an '@' is found and the dns name matches, search for the * name with the dns stripped off. */ if (cnt == 0 && i < len && i > 0 && (len - 1 - i) == nfsrv_dnsnamelen && !nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) { len -= (nfsrv_dnsnamelen + 1); *(cp - 1) = '\0'; } /* * Check for the special case of "nogroup". */ if (len == 7 && !NFSBCMP(str, "nogroup", 7)) { *gidp = nfsrv_defaultgid; error = 0; goto out; } hp = NFSGROUPNAMEHASH(str, len); mtx_lock(&hp->mtx); TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) { if (usrp->lug_namelen == len && !NFSBCMP(usrp->lug_name, str, len)) { if (usrp->lug_expiry < NFSD_MONOSEC) break; hp2 = NFSGROUPHASH(usrp->lug_gid); mtx_lock(&hp2->mtx); TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash); TAILQ_INSERT_TAIL(&hp2->lughead, usrp, lug_numhash); *gidp = usrp->lug_gid; mtx_unlock(&hp2->mtx); mtx_unlock(&hp->mtx); error = 0; goto out; } } mtx_unlock(&hp->mtx); cnt++; ret = nfsrv_getuser(RPCNFSUSERD_GETGROUP, (uid_t)0, (gid_t)0, str); if (ret == 0 && cnt < 2) goto tryagain; } error = NFSERR_BADOWNER; out: NFSEXITCODE(error); return (error); } /* * Cmp len chars, allowing mixed case in the first argument to match lower * case in the second, but not if the first argument is all upper case. * Return 0 for a match, 1 otherwise. */ static int nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len) { int i; u_char tmp; int fndlower = 0; for (i = 0; i < len; i++) { if (*cp >= 'A' && *cp <= 'Z') { tmp = *cp++ + ('a' - 'A'); } else { tmp = *cp++; if (tmp >= 'a' && tmp <= 'z') fndlower = 1; } if (tmp != *cp2++) return (1); } if (fndlower) return (0); else return (1); } /* * Set the port for the nfsuserd. */ int nfsrv_nfsuserdport(struct nfsuserd_args *nargs, NFSPROC_T *p) { struct nfssockreq *rp; #ifdef INET struct sockaddr_in *ad; #endif #ifdef INET6 struct sockaddr_in6 *ad6; const struct in6_addr in6loopback = IN6ADDR_LOOPBACK_INIT; #endif int error; NFSLOCKNAMEID(); if (nfsrv_nfsuserd != NOTRUNNING) { NFSUNLOCKNAMEID(); error = EPERM; goto out; } nfsrv_nfsuserd = STARTSTOP; /* * Set up the socket record and connect. * Set nr_client NULL before unlocking, just to ensure that no other * process/thread/core will use a bogus old value. This could only * occur if the use of the nameid lock to protect nfsrv_nfsuserd is * broken. */ rp = &nfsrv_nfsuserdsock; rp->nr_client = NULL; NFSUNLOCKNAMEID(); rp->nr_sotype = SOCK_DGRAM; rp->nr_soproto = IPPROTO_UDP; rp->nr_lock = (NFSR_RESERVEDPORT | NFSR_LOCALHOST); rp->nr_cred = NULL; rp->nr_prog = RPCPROG_NFSUSERD; error = 0; switch (nargs->nuserd_family) { #ifdef INET case AF_INET: rp->nr_nam = malloc(sizeof(struct sockaddr_in), M_SONAME, M_WAITOK | M_ZERO); ad = (struct sockaddr_in *)rp->nr_nam; ad->sin_len = sizeof(struct sockaddr_in); ad->sin_family = AF_INET; ad->sin_addr.s_addr = htonl(INADDR_LOOPBACK); ad->sin_port = nargs->nuserd_port; break; #endif #ifdef INET6 case AF_INET6: rp->nr_nam = malloc(sizeof(struct sockaddr_in6), M_SONAME, M_WAITOK | M_ZERO); ad6 = (struct sockaddr_in6 *)rp->nr_nam; ad6->sin6_len = sizeof(struct sockaddr_in6); ad6->sin6_family = AF_INET6; ad6->sin6_addr = in6loopback; ad6->sin6_port = nargs->nuserd_port; break; #endif default: error = ENXIO; } rp->nr_vers = RPCNFSUSERD_VERS; if (error == 0) error = newnfs_connect(NULL, rp, NFSPROCCRED(p), p, 0, false, &rp->nr_client); if (error == 0) { NFSLOCKNAMEID(); nfsrv_nfsuserd = RUNNING; NFSUNLOCKNAMEID(); } else { free(rp->nr_nam, M_SONAME); NFSLOCKNAMEID(); nfsrv_nfsuserd = NOTRUNNING; NFSUNLOCKNAMEID(); } out: NFSEXITCODE(error); return (error); } /* * Delete the nfsuserd port. */ void nfsrv_nfsuserddelport(void) { NFSLOCKNAMEID(); if (nfsrv_nfsuserd != RUNNING) { NFSUNLOCKNAMEID(); return; } nfsrv_nfsuserd = STARTSTOP; /* Wait for all upcalls to complete. */ while (nfsrv_userdupcalls > 0) msleep(&nfsrv_userdupcalls, NFSNAMEIDMUTEXPTR, PVFS, "nfsupcalls", 0); NFSUNLOCKNAMEID(); newnfs_disconnect(NULL, &nfsrv_nfsuserdsock); free(nfsrv_nfsuserdsock.nr_nam, M_SONAME); NFSLOCKNAMEID(); nfsrv_nfsuserd = NOTRUNNING; NFSUNLOCKNAMEID(); } /* * Do upcalls to the nfsuserd, for cache misses of the owner/ownergroup * name<-->id cache. * Returns 0 upon success, non-zero otherwise. */ static int nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name) { u_int32_t *tl; struct nfsrv_descript *nd; int len; struct nfsrv_descript nfsd; struct ucred *cred; int error; NFSLOCKNAMEID(); if (nfsrv_nfsuserd != RUNNING) { NFSUNLOCKNAMEID(); error = EPERM; goto out; } /* * Maintain a count of upcalls in progress, so that nfsrv_X() * can wait until no upcalls are in progress. */ nfsrv_userdupcalls++; NFSUNLOCKNAMEID(); KASSERT(nfsrv_userdupcalls > 0, ("nfsrv_getuser: non-positive upcalls")); nd = &nfsd; cred = newnfs_getcred(); nd->nd_flag = ND_GSSINITREPLY; nfsrvd_rephead(nd); nd->nd_procnum = procnum; if (procnum == RPCNFSUSERD_GETUID || procnum == RPCNFSUSERD_GETGID) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (procnum == RPCNFSUSERD_GETUID) *tl = txdr_unsigned(uid); else *tl = txdr_unsigned(gid); } else { len = strlen(name); (void) nfsm_strtom(nd, name, len); } error = newnfs_request(nd, NULL, NULL, &nfsrv_nfsuserdsock, NULL, NULL, cred, RPCPROG_NFSUSERD, RPCNFSUSERD_VERS, NULL, 0, NULL, NULL); NFSLOCKNAMEID(); if (--nfsrv_userdupcalls == 0 && nfsrv_nfsuserd == STARTSTOP) wakeup(&nfsrv_userdupcalls); NFSUNLOCKNAMEID(); NFSFREECRED(cred); if (!error) { m_freem(nd->nd_mrep); error = nd->nd_repstat; } out: NFSEXITCODE(error); return (error); } /* * This function is called from the nfssvc(2) system call, to update the * kernel user/group name list(s) for the V4 owner and ownergroup attributes. */ int nfssvc_idname(struct nfsd_idargs *nidp) { struct nfsusrgrp *nusrp, *usrp, *newusrp; struct nfsrv_lughash *hp_name, *hp_idnum, *thp; int i, group_locked, groupname_locked, user_locked, username_locked; int error = 0; u_char *cp; gid_t *grps; struct ucred *cr; static int onethread = 0; static time_t lasttime = 0; if (nidp->nid_namelen <= 0 || nidp->nid_namelen > MAXHOSTNAMELEN) { error = EINVAL; goto out; } if (nidp->nid_flag & NFSID_INITIALIZE) { cp = malloc(nidp->nid_namelen + 1, M_NFSSTRING, M_WAITOK); error = copyin(nidp->nid_name, cp, nidp->nid_namelen); if (error != 0) { free(cp, M_NFSSTRING); goto out; } if (atomic_cmpset_acq_int(&nfsrv_dnsnamelen, 0, 0) == 0) { /* * Free up all the old stuff and reinitialize hash * lists. All mutexes for both lists must be locked, * with the user/group name ones before the uid/gid * ones, to avoid a LOR. */ for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsusernamehash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsuserhash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_FOREACH_SAFE(usrp, &nfsuserhash[i].lughead, lug_numhash, nusrp) nfsrv_removeuser(usrp, 1); for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsuserhash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsusernamehash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsgroupnamehash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsgrouphash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_FOREACH_SAFE(usrp, &nfsgrouphash[i].lughead, lug_numhash, nusrp) nfsrv_removeuser(usrp, 0); for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsgrouphash[i].mtx); for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsgroupnamehash[i].mtx); free(nfsrv_dnsname, M_NFSSTRING); nfsrv_dnsname = NULL; } if (nfsuserhash == NULL) { /* Allocate the hash tables. */ nfsuserhash = malloc(sizeof(struct nfsrv_lughash) * nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK | M_ZERO); for (i = 0; i < nfsrv_lughashsize; i++) mtx_init(&nfsuserhash[i].mtx, "nfsuidhash", NULL, MTX_DEF | MTX_DUPOK); nfsusernamehash = malloc(sizeof(struct nfsrv_lughash) * nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK | M_ZERO); for (i = 0; i < nfsrv_lughashsize; i++) mtx_init(&nfsusernamehash[i].mtx, "nfsusrhash", NULL, MTX_DEF | MTX_DUPOK); nfsgrouphash = malloc(sizeof(struct nfsrv_lughash) * nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK | M_ZERO); for (i = 0; i < nfsrv_lughashsize; i++) mtx_init(&nfsgrouphash[i].mtx, "nfsgidhash", NULL, MTX_DEF | MTX_DUPOK); nfsgroupnamehash = malloc(sizeof(struct nfsrv_lughash) * nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK | M_ZERO); for (i = 0; i < nfsrv_lughashsize; i++) mtx_init(&nfsgroupnamehash[i].mtx, "nfsgrphash", NULL, MTX_DEF | MTX_DUPOK); } /* (Re)initialize the list heads. */ for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_INIT(&nfsuserhash[i].lughead); for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_INIT(&nfsusernamehash[i].lughead); for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_INIT(&nfsgrouphash[i].lughead); for (i = 0; i < nfsrv_lughashsize; i++) TAILQ_INIT(&nfsgroupnamehash[i].lughead); /* * Put name in "DNS" string. */ nfsrv_dnsname = cp; nfsrv_defaultuid = nidp->nid_uid; nfsrv_defaultgid = nidp->nid_gid; nfsrv_usercnt = 0; nfsrv_usermax = nidp->nid_usermax; atomic_store_rel_int(&nfsrv_dnsnamelen, nidp->nid_namelen); goto out; } /* * malloc the new one now, so any potential sleep occurs before * manipulation of the lists. */ newusrp = malloc(sizeof(struct nfsusrgrp) + nidp->nid_namelen, M_NFSUSERGROUP, M_WAITOK | M_ZERO); error = copyin(nidp->nid_name, newusrp->lug_name, nidp->nid_namelen); if (error == 0 && nidp->nid_ngroup > 0 && (nidp->nid_flag & NFSID_ADDUID) != 0) { grps = malloc(sizeof(gid_t) * nidp->nid_ngroup, M_TEMP, M_WAITOK); error = copyin(nidp->nid_grps, grps, sizeof(gid_t) * nidp->nid_ngroup); if (error == 0) { /* * Create a credential just like svc_getcred(), * but using the group list provided. */ cr = crget(); cr->cr_uid = cr->cr_ruid = cr->cr_svuid = nidp->nid_uid; crsetgroups(cr, nidp->nid_ngroup, grps); cr->cr_rgid = cr->cr_svgid = cr->cr_groups[0]; cr->cr_prison = &prison0; prison_hold(cr->cr_prison); #ifdef MAC mac_cred_associate_nfsd(cr); #endif newusrp->lug_cred = cr; } free(grps, M_TEMP); } if (error) { free(newusrp, M_NFSUSERGROUP); goto out; } newusrp->lug_namelen = nidp->nid_namelen; /* * The lock order is username[0]->[nfsrv_lughashsize - 1] followed * by uid[0]->[nfsrv_lughashsize - 1], with the same for group. * The flags user_locked, username_locked, group_locked and * groupname_locked are set to indicate all of those hash lists are * locked. hp_name != NULL and hp_idnum != NULL indicates that * the respective one mutex is locked. */ user_locked = username_locked = group_locked = groupname_locked = 0; hp_name = hp_idnum = NULL; /* * Delete old entries, as required. */ if (nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID)) { /* Must lock all username hash lists first, to avoid a LOR. */ for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsusernamehash[i].mtx); username_locked = 1; hp_idnum = NFSUSERHASH(nidp->nid_uid); mtx_lock(&hp_idnum->mtx); TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash, nusrp) { if (usrp->lug_uid == nidp->nid_uid) nfsrv_removeuser(usrp, 1); } } else if (nidp->nid_flag & (NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) { hp_name = NFSUSERNAMEHASH(newusrp->lug_name, newusrp->lug_namelen); mtx_lock(&hp_name->mtx); TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash, nusrp) { if (usrp->lug_namelen == newusrp->lug_namelen && !NFSBCMP(usrp->lug_name, newusrp->lug_name, usrp->lug_namelen)) { thp = NFSUSERHASH(usrp->lug_uid); mtx_lock(&thp->mtx); nfsrv_removeuser(usrp, 1); mtx_unlock(&thp->mtx); } } hp_idnum = NFSUSERHASH(nidp->nid_uid); mtx_lock(&hp_idnum->mtx); } else if (nidp->nid_flag & (NFSID_DELGID | NFSID_ADDGID)) { /* Must lock all groupname hash lists first, to avoid a LOR. */ for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsgroupnamehash[i].mtx); groupname_locked = 1; hp_idnum = NFSGROUPHASH(nidp->nid_gid); mtx_lock(&hp_idnum->mtx); TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash, nusrp) { if (usrp->lug_gid == nidp->nid_gid) nfsrv_removeuser(usrp, 0); } } else if (nidp->nid_flag & (NFSID_DELGROUPNAME | NFSID_ADDGROUPNAME)) { hp_name = NFSGROUPNAMEHASH(newusrp->lug_name, newusrp->lug_namelen); mtx_lock(&hp_name->mtx); TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash, nusrp) { if (usrp->lug_namelen == newusrp->lug_namelen && !NFSBCMP(usrp->lug_name, newusrp->lug_name, usrp->lug_namelen)) { thp = NFSGROUPHASH(usrp->lug_gid); mtx_lock(&thp->mtx); nfsrv_removeuser(usrp, 0); mtx_unlock(&thp->mtx); } } hp_idnum = NFSGROUPHASH(nidp->nid_gid); mtx_lock(&hp_idnum->mtx); } /* * Now, we can add the new one. */ if (nidp->nid_usertimeout) newusrp->lug_expiry = NFSD_MONOSEC + nidp->nid_usertimeout; else newusrp->lug_expiry = NFSD_MONOSEC + 5; if (nidp->nid_flag & (NFSID_ADDUID | NFSID_ADDUSERNAME)) { newusrp->lug_uid = nidp->nid_uid; thp = NFSUSERHASH(newusrp->lug_uid); mtx_assert(&thp->mtx, MA_OWNED); TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash); thp = NFSUSERNAMEHASH(newusrp->lug_name, newusrp->lug_namelen); mtx_assert(&thp->mtx, MA_OWNED); TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash); atomic_add_int(&nfsrv_usercnt, 1); } else if (nidp->nid_flag & (NFSID_ADDGID | NFSID_ADDGROUPNAME)) { newusrp->lug_gid = nidp->nid_gid; thp = NFSGROUPHASH(newusrp->lug_gid); mtx_assert(&thp->mtx, MA_OWNED); TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash); thp = NFSGROUPNAMEHASH(newusrp->lug_name, newusrp->lug_namelen); mtx_assert(&thp->mtx, MA_OWNED); TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash); atomic_add_int(&nfsrv_usercnt, 1); } else { if (newusrp->lug_cred != NULL) crfree(newusrp->lug_cred); free(newusrp, M_NFSUSERGROUP); } /* * Once per second, allow one thread to trim the cache. */ if (lasttime < NFSD_MONOSEC && atomic_cmpset_acq_int(&onethread, 0, 1) != 0) { /* * First, unlock the single mutexes, so that all entries * can be locked and any LOR is avoided. */ if (hp_name != NULL) { mtx_unlock(&hp_name->mtx); hp_name = NULL; } if (hp_idnum != NULL) { mtx_unlock(&hp_idnum->mtx); hp_idnum = NULL; } if ((nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID | NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) != 0) { if (username_locked == 0) { for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsusernamehash[i].mtx); username_locked = 1; } KASSERT(user_locked == 0, ("nfssvc_idname: user_locked")); for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsuserhash[i].mtx); user_locked = 1; for (i = 0; i < nfsrv_lughashsize; i++) { TAILQ_FOREACH_SAFE(usrp, &nfsuserhash[i].lughead, lug_numhash, nusrp) if (usrp->lug_expiry < NFSD_MONOSEC) nfsrv_removeuser(usrp, 1); } for (i = 0; i < nfsrv_lughashsize; i++) { /* * Trim the cache using an approximate LRU * algorithm. This code deletes the least * recently used entry on each hash list. */ if (nfsrv_usercnt <= nfsrv_usermax) break; usrp = TAILQ_FIRST(&nfsuserhash[i].lughead); if (usrp != NULL) nfsrv_removeuser(usrp, 1); } } else { if (groupname_locked == 0) { for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsgroupnamehash[i].mtx); groupname_locked = 1; } KASSERT(group_locked == 0, ("nfssvc_idname: group_locked")); for (i = 0; i < nfsrv_lughashsize; i++) mtx_lock(&nfsgrouphash[i].mtx); group_locked = 1; for (i = 0; i < nfsrv_lughashsize; i++) { TAILQ_FOREACH_SAFE(usrp, &nfsgrouphash[i].lughead, lug_numhash, nusrp) if (usrp->lug_expiry < NFSD_MONOSEC) nfsrv_removeuser(usrp, 0); } for (i = 0; i < nfsrv_lughashsize; i++) { /* * Trim the cache using an approximate LRU * algorithm. This code deletes the least * recently user entry on each hash list. */ if (nfsrv_usercnt <= nfsrv_usermax) break; usrp = TAILQ_FIRST(&nfsgrouphash[i].lughead); if (usrp != NULL) nfsrv_removeuser(usrp, 0); } } lasttime = NFSD_MONOSEC; atomic_store_rel_int(&onethread, 0); } /* Now, unlock all locked mutexes. */ if (hp_idnum != NULL) mtx_unlock(&hp_idnum->mtx); if (hp_name != NULL) mtx_unlock(&hp_name->mtx); if (user_locked != 0) for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsuserhash[i].mtx); if (username_locked != 0) for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsusernamehash[i].mtx); if (group_locked != 0) for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsgrouphash[i].mtx); if (groupname_locked != 0) for (i = 0; i < nfsrv_lughashsize; i++) mtx_unlock(&nfsgroupnamehash[i].mtx); out: NFSEXITCODE(error); return (error); } /* * Remove a user/group name element. */ static void nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser) { struct nfsrv_lughash *hp; if (isuser != 0) { hp = NFSUSERHASH(usrp->lug_uid); mtx_assert(&hp->mtx, MA_OWNED); TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); hp = NFSUSERNAMEHASH(usrp->lug_name, usrp->lug_namelen); mtx_assert(&hp->mtx, MA_OWNED); TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash); } else { hp = NFSGROUPHASH(usrp->lug_gid); mtx_assert(&hp->mtx, MA_OWNED); TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); hp = NFSGROUPNAMEHASH(usrp->lug_name, usrp->lug_namelen); mtx_assert(&hp->mtx, MA_OWNED); TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash); } atomic_add_int(&nfsrv_usercnt, -1); if (usrp->lug_cred != NULL) crfree(usrp->lug_cred); free(usrp, M_NFSUSERGROUP); } /* * Free up all the allocations related to the name<-->id cache. * This function should only be called when the nfsuserd daemon isn't * running, since it doesn't do any locking. * This function is meant to be used when the nfscommon module is unloaded. */ void nfsrv_cleanusergroup(void) { struct nfsrv_lughash *hp, *hp2; struct nfsusrgrp *nusrp, *usrp; int i; if (nfsuserhash == NULL) return; for (i = 0; i < nfsrv_lughashsize; i++) { hp = &nfsuserhash[i]; TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) { TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); hp2 = NFSUSERNAMEHASH(usrp->lug_name, usrp->lug_namelen); TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash); if (usrp->lug_cred != NULL) crfree(usrp->lug_cred); free(usrp, M_NFSUSERGROUP); } hp = &nfsgrouphash[i]; TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) { TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash); hp2 = NFSGROUPNAMEHASH(usrp->lug_name, usrp->lug_namelen); TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash); if (usrp->lug_cred != NULL) crfree(usrp->lug_cred); free(usrp, M_NFSUSERGROUP); } mtx_destroy(&nfsuserhash[i].mtx); mtx_destroy(&nfsusernamehash[i].mtx); mtx_destroy(&nfsgroupnamehash[i].mtx); mtx_destroy(&nfsgrouphash[i].mtx); } free(nfsuserhash, M_NFSUSERGROUP); free(nfsusernamehash, M_NFSUSERGROUP); free(nfsgrouphash, M_NFSUSERGROUP); free(nfsgroupnamehash, M_NFSUSERGROUP); free(nfsrv_dnsname, M_NFSSTRING); } /* * This function scans a byte string and checks for UTF-8 compliance. * It returns 0 if it conforms and NFSERR_INVAL if not. */ int nfsrv_checkutf8(u_int8_t *cp, int len) { u_int32_t val = 0x0; int cnt = 0, gotd = 0, shift = 0; u_int8_t byte; static int utf8_shift[5] = { 7, 11, 16, 21, 26 }; int error = 0; /* * Here are what the variables are used for: * val - the calculated value of a multibyte char, used to check * that it was coded with the correct range * cnt - the number of 10xxxxxx bytes to follow * gotd - set for a char of Dxxx, so D800<->DFFF can be checked for * shift - lower order bits of range (ie. "val >> shift" should * not be 0, in other words, dividing by the lower bound * of the range should get a non-zero value) * byte - used to calculate cnt */ while (len > 0) { if (cnt > 0) { /* This handles the 10xxxxxx bytes */ if ((*cp & 0xc0) != 0x80 || (gotd && (*cp & 0x20))) { error = NFSERR_INVAL; goto out; } gotd = 0; val <<= 6; val |= (*cp & 0x3f); cnt--; if (cnt == 0 && (val >> shift) == 0x0) { error = NFSERR_INVAL; goto out; } } else if (*cp & 0x80) { /* first byte of multi byte char */ byte = *cp; while ((byte & 0x40) && cnt < 6) { cnt++; byte <<= 1; } if (cnt == 0 || cnt == 6) { error = NFSERR_INVAL; goto out; } val = (*cp & (0x3f >> cnt)); shift = utf8_shift[cnt - 1]; if (cnt == 2 && val == 0xd) /* Check for the 0xd800-0xdfff case */ gotd = 1; } cp++; len--; } if (cnt > 0) error = NFSERR_INVAL; out: NFSEXITCODE(error); return (error); } /* * Parse the xdr for an NFSv4 FsLocations attribute. Return two malloc'd * strings, one with the root path in it and the other with the list of * locations. The list is in the same format as is found in nfr_refs. * It is a "," separated list of entries, where each of them is of the * form :. For example * "nfsv4-test:/sub2,nfsv4-test2:/user/mnt,nfsv4-test2:/user/mnt2" * The nilp argument is set to 1 for the special case of a null fs_root * and an empty server list. * It returns NFSERR_BADXDR, if the xdr can't be parsed and returns the * number of xdr bytes parsed in sump. */ static int nfsrv_getrefstr(struct nfsrv_descript *nd, u_char **fsrootp, u_char **srvp, int *sump, int *nilp) { u_int32_t *tl; u_char *cp = NULL, *cp2 = NULL, *cp3, *str; int i, j, len, stringlen, cnt, slen, siz, xdrsum, error = 0, nsrv; struct list { SLIST_ENTRY(list) next; int len; u_char host[1]; } *lsp, *nlsp; SLIST_HEAD(, list) head; *fsrootp = NULL; *srvp = NULL; *nilp = 0; /* * Get the fs_root path and check for the special case of null path * and 0 length server list. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len < 0 || len > 10240) { error = NFSERR_BADXDR; goto nfsmout; } if (len == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl != 0) { error = NFSERR_BADXDR; goto nfsmout; } *nilp = 1; *sump = 2 * NFSX_UNSIGNED; error = 0; goto nfsmout; } cp = malloc(len + 1, M_NFSSTRING, M_WAITOK); error = nfsrv_mtostr(nd, cp, len); if (!error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); cnt = fxdr_unsigned(int, *tl); if (cnt <= 0) error = NFSERR_BADXDR; } if (error) goto nfsmout; /* * Now, loop through the location list and make up the srvlist. */ xdrsum = (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len); cp2 = cp3 = malloc(1024, M_NFSSTRING, M_WAITOK); slen = 1024; siz = 0; for (i = 0; i < cnt; i++) { SLIST_INIT(&head); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nsrv = fxdr_unsigned(int, *tl); if (nsrv <= 0) { error = NFSERR_BADXDR; goto nfsmout; } /* * Handle the first server by putting it in the srvstr. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0 || len > 1024) { error = NFSERR_BADXDR; goto nfsmout; } nfsrv_refstrbigenough(siz + len + 3, &cp2, &cp3, &slen); if (cp3 != cp2) { *cp3++ = ','; siz++; } error = nfsrv_mtostr(nd, cp3, len); if (error) goto nfsmout; cp3 += len; *cp3++ = ':'; siz += (len + 1); xdrsum += (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len); for (j = 1; j < nsrv; j++) { /* * Yuck, put them in an slist and process them later. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0 || len > 1024) { error = NFSERR_BADXDR; goto nfsmout; } lsp = (struct list *)malloc(sizeof (struct list) + len, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, lsp->host, len); if (error) goto nfsmout; xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len); lsp->len = len; SLIST_INSERT_HEAD(&head, lsp, next); } /* * Finally, we can get the path. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0 || len > 1024) { error = NFSERR_BADXDR; goto nfsmout; } nfsrv_refstrbigenough(siz + len + 1, &cp2, &cp3, &slen); error = nfsrv_mtostr(nd, cp3, len); if (error) goto nfsmout; xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len); str = cp3; stringlen = len; cp3 += len; siz += len; SLIST_FOREACH_SAFE(lsp, &head, next, nlsp) { nfsrv_refstrbigenough(siz + lsp->len + stringlen + 3, &cp2, &cp3, &slen); *cp3++ = ','; NFSBCOPY(lsp->host, cp3, lsp->len); cp3 += lsp->len; *cp3++ = ':'; NFSBCOPY(str, cp3, stringlen); cp3 += stringlen; *cp3 = '\0'; siz += (lsp->len + stringlen + 2); free(lsp, M_TEMP); } } *fsrootp = cp; *srvp = cp2; *sump = xdrsum; NFSEXITCODE2(0, nd); return (0); nfsmout: if (cp != NULL) free(cp, M_NFSSTRING); if (cp2 != NULL) free(cp2, M_NFSSTRING); NFSEXITCODE2(error, nd); return (error); } /* * Make the malloc'd space large enough. This is a pain, but the xdr * doesn't set an upper bound on the side, so... */ static void nfsrv_refstrbigenough(int siz, u_char **cpp, u_char **cpp2, int *slenp) { u_char *cp; int i; if (siz <= *slenp) return; cp = malloc(siz + 1024, M_NFSSTRING, M_WAITOK); NFSBCOPY(*cpp, cp, *slenp); free(*cpp, M_NFSSTRING); i = *cpp2 - *cpp; *cpp = cp; *cpp2 = cp + i; *slenp = siz + 1024; } /* * Initialize the reply header data structures. */ void nfsrvd_rephead(struct nfsrv_descript *nd) { struct mbuf *mreq; if ((nd->nd_flag & ND_EXTPG) != 0) { mreq = mb_alloc_ext_plus_pages(PAGE_SIZE, M_WAITOK); nd->nd_mreq = nd->nd_mb = mreq; nd->nd_bpos = (char *)(void *) PHYS_TO_DMAP(mreq->m_epg_pa[0]); nd->nd_bextpg = 0; nd->nd_bextpgsiz = PAGE_SIZE; } else { /* * If this is a big reply, use a cluster. */ if ((nd->nd_flag & ND_GSSINITREPLY) == 0 && nfs_bigreply[nd->nd_procnum]) { NFSMCLGET(mreq, M_WAITOK); nd->nd_mreq = mreq; nd->nd_mb = mreq; } else { NFSMGET(mreq); nd->nd_mreq = mreq; nd->nd_mb = mreq; } nd->nd_bpos = mtod(mreq, char *); mreq->m_len = 0; } if ((nd->nd_flag & ND_GSSINITREPLY) == 0) NFSM_BUILD(nd->nd_errp, int *, NFSX_UNSIGNED); } /* * Lock a socket against others. * Currently used to serialize connect/disconnect attempts. */ int newnfs_sndlock(int *flagp) { struct timespec ts; NFSLOCKSOCK(); while (*flagp & NFSR_SNDLOCK) { *flagp |= NFSR_WANTSND; ts.tv_sec = 0; ts.tv_nsec = 0; (void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR, PZERO - 1, "nfsndlck", &ts); } *flagp |= NFSR_SNDLOCK; NFSUNLOCKSOCK(); return (0); } /* * Unlock the stream socket for others. */ void newnfs_sndunlock(int *flagp) { NFSLOCKSOCK(); if ((*flagp & NFSR_SNDLOCK) == 0) panic("nfs sndunlock"); *flagp &= ~NFSR_SNDLOCK; if (*flagp & NFSR_WANTSND) { *flagp &= ~NFSR_WANTSND; wakeup((caddr_t)flagp); } NFSUNLOCKSOCK(); } int nfsv4_getipaddr(struct nfsrv_descript *nd, struct sockaddr_in *sin, struct sockaddr_in6 *sin6, sa_family_t *saf, int *isudp) { struct in_addr saddr; uint32_t portnum, *tl; int i, j, k; sa_family_t af = AF_UNSPEC; char addr[64], protocol[5], *cp; int cantparse = 0, error = 0; uint16_t portv; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); if (i >= 3 && i <= 4) { error = nfsrv_mtostr(nd, protocol, i); if (error) goto nfsmout; if (strcmp(protocol, "tcp") == 0) { af = AF_INET; *isudp = 0; } else if (strcmp(protocol, "udp") == 0) { af = AF_INET; *isudp = 1; } else if (strcmp(protocol, "tcp6") == 0) { af = AF_INET6; *isudp = 0; } else if (strcmp(protocol, "udp6") == 0) { af = AF_INET6; *isudp = 1; } else cantparse = 1; } else { cantparse = 1; if (i > 0) { error = nfsm_advance(nd, NFSM_RNDUP(i), -1); if (error) goto nfsmout; } } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); if (i < 0) { error = NFSERR_BADXDR; goto nfsmout; } else if (cantparse == 0 && i >= 11 && i < 64) { /* * The shortest address is 11chars and the longest is < 64. */ error = nfsrv_mtostr(nd, addr, i); if (error) goto nfsmout; /* Find the port# at the end and extract that. */ i = strlen(addr); k = 0; cp = &addr[i - 1]; /* Count back two '.'s from end to get port# field. */ for (j = 0; j < i; j++) { if (*cp == '.') { k++; if (k == 2) break; } cp--; } if (k == 2) { /* * The NFSv4 port# is appended as .N.N, where N is * a decimal # in the range 0-255, just like an inet4 * address. Cheat and use inet_aton(), which will * return a Class A address and then shift the high * order 8bits over to convert it to the port#. */ *cp++ = '\0'; if (inet_aton(cp, &saddr) == 1) { portnum = ntohl(saddr.s_addr); portv = (uint16_t)((portnum >> 16) | (portnum & 0xff)); } else cantparse = 1; } else cantparse = 1; if (cantparse == 0) { if (af == AF_INET) { if (inet_pton(af, addr, &sin->sin_addr) == 1) { sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = htons(portv); *saf = af; return (0); } } else { if (inet_pton(af, addr, &sin6->sin6_addr) == 1) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(portv); *saf = af; return (0); } } } } else { if (i > 0) { error = nfsm_advance(nd, NFSM_RNDUP(i), -1); if (error) goto nfsmout; } } error = EPERM; nfsmout: return (error); } /* * Handle an NFSv4.1 Sequence request for the session. * If reply != NULL, use it to return the cached reply, as required. * The client gets a cached reply via this call for callbacks, however the * server gets a cached reply via the nfsv4_seqsess_cacherep() call. */ int nfsv4_seqsession(uint32_t seqid, uint32_t slotid, uint32_t highslot, struct nfsslot *slots, struct mbuf **reply, uint16_t maxslot) { struct mbuf *m; int error; error = 0; if (reply != NULL) *reply = NULL; if (slotid > maxslot) return (NFSERR_BADSLOT); if (seqid == slots[slotid].nfssl_seq) { /* A retry. */ if (slots[slotid].nfssl_inprog != 0) error = NFSERR_DELAY; else if (slots[slotid].nfssl_reply != NULL) { if (reply != NULL) { m = m_copym(slots[slotid].nfssl_reply, 0, M_COPYALL, M_NOWAIT); if (m != NULL) *reply = m; else { *reply = slots[slotid].nfssl_reply; slots[slotid].nfssl_reply = NULL; } } slots[slotid].nfssl_inprog = 1; error = NFSERR_REPLYFROMCACHE; } else /* No reply cached, so just do it. */ slots[slotid].nfssl_inprog = 1; } else if ((slots[slotid].nfssl_seq + 1) == seqid) { if (slots[slotid].nfssl_reply != NULL) m_freem(slots[slotid].nfssl_reply); slots[slotid].nfssl_reply = NULL; slots[slotid].nfssl_inprog = 1; slots[slotid].nfssl_seq++; } else error = NFSERR_SEQMISORDERED; return (error); } /* * Cache this reply for the slot. * Use the "rep" argument to return the cached reply if repstat is set to * NFSERR_REPLYFROMCACHE. The client never sets repstat to this value. */ void nfsv4_seqsess_cacherep(uint32_t slotid, struct nfsslot *slots, int repstat, struct mbuf **rep) { struct mbuf *m; if (repstat == NFSERR_REPLYFROMCACHE) { if (slots[slotid].nfssl_reply != NULL) { /* * We cannot sleep here, but copy will usually * succeed. */ m = m_copym(slots[slotid].nfssl_reply, 0, M_COPYALL, M_NOWAIT); if (m != NULL) *rep = m; else { /* * Multiple retries would be extremely rare, * so using the cached reply will likely * be ok. */ *rep = slots[slotid].nfssl_reply; slots[slotid].nfssl_reply = NULL; } } else *rep = NULL; } else { if (slots[slotid].nfssl_reply != NULL) m_freem(slots[slotid].nfssl_reply); slots[slotid].nfssl_reply = *rep; } slots[slotid].nfssl_inprog = 0; } /* * Generate the xdr for an NFSv4.1 Sequence Operation. */ void nfsv4_setsequence(struct nfsmount *nmp, struct nfsrv_descript *nd, struct nfsclsession *sep, int dont_replycache) { uint32_t *tl, slotseq = 0; int error, maxslot, slotpos; uint8_t sessionid[NFSX_V4SESSIONID]; error = nfsv4_sequencelookup(nmp, sep, &slotpos, &maxslot, &slotseq, sessionid); nd->nd_maxreq = sep->nfsess_maxreq; nd->nd_maxresp = sep->nfsess_maxresp; /* Build the Sequence arguments. */ NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED); nd->nd_sequence = tl; bcopy(sessionid, tl, NFSX_V4SESSIONID); tl += NFSX_V4SESSIONID / NFSX_UNSIGNED; nd->nd_slotseq = tl; if (error == 0) { nd->nd_flag |= ND_HASSLOTID; nd->nd_slotid = slotpos; *tl++ = txdr_unsigned(slotseq); *tl++ = txdr_unsigned(slotpos); *tl++ = txdr_unsigned(maxslot); if (dont_replycache == 0) *tl = newnfs_true; else *tl = newnfs_false; } else { /* * There are two errors and the rest of the session can * just be zeros. * NFSERR_BADSESSION: This bad session should just generate * the same error again when the RPC is retried. * ESTALE: A forced dismount is in progress and will cause the * RPC to fail later. */ *tl++ = 0; *tl++ = 0; *tl++ = 0; *tl = 0; } nd->nd_flag |= ND_HASSEQUENCE; } int nfsv4_sequencelookup(struct nfsmount *nmp, struct nfsclsession *sep, int *slotposp, int *maxslotp, uint32_t *slotseqp, uint8_t *sessionid) { int i, maxslot, slotpos; uint64_t bitval; /* Find an unused slot. */ slotpos = -1; maxslot = -1; mtx_lock(&sep->nfsess_mtx); do { if (nmp != NULL && sep->nfsess_defunct != 0) { /* Just return the bad session. */ bcopy(sep->nfsess_sessionid, sessionid, NFSX_V4SESSIONID); mtx_unlock(&sep->nfsess_mtx); return (NFSERR_BADSESSION); } bitval = 1; for (i = 0; i < sep->nfsess_foreslots; i++) { if ((bitval & sep->nfsess_slots) == 0) { slotpos = i; sep->nfsess_slots |= bitval; sep->nfsess_slotseq[i]++; *slotseqp = sep->nfsess_slotseq[i]; break; } bitval <<= 1; } if (slotpos == -1) { /* * If a forced dismount is in progress, just return. * This RPC attempt will fail when it calls * newnfs_request(). */ if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) { mtx_unlock(&sep->nfsess_mtx); return (ESTALE); } /* Wake up once/sec, to check for a forced dismount. */ (void)mtx_sleep(&sep->nfsess_slots, &sep->nfsess_mtx, PZERO, "nfsclseq", hz); } } while (slotpos == -1); /* Now, find the highest slot in use. (nfsc_slots is 64bits) */ bitval = 1; for (i = 0; i < 64; i++) { if ((bitval & sep->nfsess_slots) != 0) maxslot = i; bitval <<= 1; } bcopy(sep->nfsess_sessionid, sessionid, NFSX_V4SESSIONID); mtx_unlock(&sep->nfsess_mtx); *slotposp = slotpos; *maxslotp = maxslot; return (0); } /* * Free a session slot. */ void nfsv4_freeslot(struct nfsclsession *sep, int slot, bool resetseq) { uint64_t bitval; bitval = 1; if (slot > 0) bitval <<= slot; mtx_lock(&sep->nfsess_mtx); if (resetseq) sep->nfsess_slotseq[slot]--; if ((bitval & sep->nfsess_slots) == 0) printf("freeing free slot!!\n"); sep->nfsess_slots &= ~bitval; wakeup(&sep->nfsess_slots); mtx_unlock(&sep->nfsess_mtx); } /* * Search for a matching pnfsd DS, based on the nmp arg. * Return one if found, NULL otherwise. */ struct nfsdevice * nfsv4_findmirror(struct nfsmount *nmp) { struct nfsdevice *ds; mtx_assert(NFSDDSMUTEXPTR, MA_OWNED); /* * Search the DS server list for a match with nmp. */ if (nfsrv_devidcnt == 0) return (NULL); TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) { if (ds->nfsdev_nmp == nmp) { NFSCL_DEBUG(4, "nfsv4_findmirror: fnd main ds\n"); break; } } return (ds); } /* * Fill in the fields of "struct nfsrv_descript". */ void nfsm_set(struct nfsrv_descript *nd, u_int offs) { struct mbuf *m; int rlen; m = nd->nd_mb; if ((m->m_flags & M_EXTPG) != 0) { nd->nd_bextpg = 0; while (offs > 0) { if (nd->nd_bextpg == 0) rlen = m_epg_pagelen(m, 0, m->m_epg_1st_off); else rlen = m_epg_pagelen(m, nd->nd_bextpg, 0); if (offs <= rlen) break; offs -= rlen; nd->nd_bextpg++; if (nd->nd_bextpg == m->m_epg_npgs) { printf("nfsm_set: build offs " "out of range\n"); nd->nd_bextpg--; break; } } nd->nd_bpos = (char *)(void *) PHYS_TO_DMAP(m->m_epg_pa[nd->nd_bextpg]); if (nd->nd_bextpg == 0) nd->nd_bpos += m->m_epg_1st_off; if (offs > 0) { nd->nd_bpos += offs; nd->nd_bextpgsiz = rlen - offs; } else if (nd->nd_bextpg == 0) nd->nd_bextpgsiz = PAGE_SIZE - m->m_epg_1st_off; else nd->nd_bextpgsiz = PAGE_SIZE; } else nd->nd_bpos = mtod(m, char *) + offs; } /* * Grow a ext_pgs mbuf list. Either allocate another page or add * an mbuf to the list. */ struct mbuf * nfsm_add_ext_pgs(struct mbuf *m, int maxextsiz, int *bextpg) { struct mbuf *mp; vm_page_t pg; if ((m->m_epg_npgs + 1) * PAGE_SIZE > maxextsiz) { mp = mb_alloc_ext_plus_pages(PAGE_SIZE, M_WAITOK); *bextpg = 0; m->m_next = mp; } else { - do { - pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | - VM_ALLOC_WIRED); - if (pg == NULL) - vm_wait(NULL); - } while (pg == NULL); + pg = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_NODUMP | + VM_ALLOC_WIRED); m->m_epg_pa[m->m_epg_npgs] = VM_PAGE_TO_PHYS(pg); *bextpg = m->m_epg_npgs; m->m_epg_npgs++; m->m_epg_last_len = 0; mp = m; } return (mp); } diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index fa0df1c37261..35ce9be88405 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -1,9039 +1,9034 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * Rpc op calls, generally called from the vnode op calls or through the * buffer cache, for NFS v2, 3 and 4. * These do not normally make any changes to vnode arguments or use * structures that might change between the VFS variants. The returned * arguments are all at the end, after the NFSPROC_T *p one. */ #include "opt_inet6.h" #include #include #include #include #include SYSCTL_DECL(_vfs_nfs); static int nfsignore_eexist = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, ignore_eexist, CTLFLAG_RW, &nfsignore_eexist, 0, "NFS ignore EEXIST replies for mkdir/symlink"); static int nfscl_dssameconn = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, dssameconn, CTLFLAG_RW, &nfscl_dssameconn, 0, "Use same TCP connection to multiple DSs"); static uint64_t nfs_maxcopyrange = SSIZE_MAX; SYSCTL_U64(_vfs_nfs, OID_AUTO, maxcopyrange, CTLFLAG_RW, &nfs_maxcopyrange, 0, "Max size of a Copy so RPC times reasonable"); /* * Global variables */ extern struct nfsstatsv1 nfsstatsv1; extern int nfs_numnfscbd; extern struct timeval nfsboottime; extern u_int32_t newnfs_false, newnfs_true; extern nfstype nfsv34_type[9]; extern int nfsrv_useacl; extern char nfsv4_callbackaddr[INET6_ADDRSTRLEN]; extern int nfscl_debuglevel; extern int nfs_pnfsiothreads; extern u_long sb_max_adj; NFSCLSTATEMUTEX; int nfstest_outofseq = 0; int nfscl_assumeposixlocks = 1; int nfscl_enablecallb = 0; short nfsv4_cbport = NFSV4_CBPORT; int nfstest_openallsetattr = 0; #define DIRHDSIZ offsetof(struct dirent, d_name) /* * nfscl_getsameserver() can return one of three values: * NFSDSP_USETHISSESSION - Use this session for the DS. * NFSDSP_SEQTHISSESSION - Use the nfsclds_sequence field of this dsp for new * session. * NFSDSP_NOTFOUND - No matching server was found. */ enum nfsclds_state { NFSDSP_USETHISSESSION = 0, NFSDSP_SEQTHISSESSION = 1, NFSDSP_NOTFOUND = 2, }; /* * Do a write RPC on a DS data file, using this structure for the arguments, * so that this function can be executed by a separate kernel process. */ struct nfsclwritedsdorpc { int done; int inprog; struct task tsk; struct vnode *vp; int iomode; int must_commit; nfsv4stateid_t *stateidp; struct nfsclds *dsp; uint64_t off; int len; #ifdef notyet int advise; #endif struct nfsfh *fhp; struct mbuf *m; int vers; int minorvers; struct ucred *cred; NFSPROC_T *p; int err; }; static int nfsrpc_setattrrpc(vnode_t , struct vattr *, nfsv4stateid_t *, struct ucred *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_readrpc(vnode_t , struct uio *, struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_writerpc(vnode_t , struct uio *, int *, int *, struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_deallocaterpc(vnode_t, off_t, off_t, nfsv4stateid_t *, struct nfsvattr *, int *, struct ucred *, NFSPROC_T *, void *); static int nfsrpc_createv23(vnode_t , char *, int, struct vattr *, nfsquad_t, int, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *); static int nfsrpc_createv4(vnode_t , char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *); static int nfsrpc_locku(struct nfsrv_descript *, struct nfsmount *, struct nfscllockowner *, u_int64_t, u_int64_t, u_int32_t, struct ucred *, NFSPROC_T *, int); static int nfsrpc_setaclrpc(vnode_t, struct ucred *, NFSPROC_T *, struct acl *, nfsv4stateid_t *, void *); static int nfsrpc_getlayout(struct nfsmount *, vnode_t, struct nfsfh *, int, uint32_t, uint32_t *, nfsv4stateid_t *, uint64_t, struct nfscllayout **, struct ucred *, NFSPROC_T *); static int nfsrpc_fillsa(struct nfsmount *, struct sockaddr_in *, struct sockaddr_in6 *, sa_family_t, int, int, struct nfsclds **, NFSPROC_T *); static void nfscl_initsessionslots(struct nfsclsession *); static int nfscl_doflayoutio(vnode_t, struct uio *, int *, int *, int *, nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *, struct nfsclflayout *, uint64_t, uint64_t, int, struct ucred *, NFSPROC_T *); static int nfscl_dofflayoutio(vnode_t, struct uio *, int *, int *, int *, nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *, struct nfsclflayout *, uint64_t, uint64_t, int, int, struct mbuf *, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_readds(vnode_t, struct uio *, nfsv4stateid_t *, int *, struct nfsclds *, uint64_t, int, struct nfsfh *, int, int, int, struct ucred *, NFSPROC_T *); static int nfsrpc_writeds(vnode_t, struct uio *, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, int, int, int, int, struct ucred *, NFSPROC_T *); static int nfsio_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int, struct ucred *, NFSPROC_T *); static enum nfsclds_state nfscl_getsameserver(struct nfsmount *, struct nfsclds *, struct nfsclds **, uint32_t *); static int nfsio_commitds(vnode_t, uint64_t, int, struct nfsclds *, struct nfsfh *, int, int, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_commitds(vnode_t, uint64_t, int, struct nfsclds *, struct nfsfh *, int, int, struct ucred *, NFSPROC_T *); #ifdef notyet static int nfsio_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *, struct nfsfh *, int, int, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *, struct nfsfh *, int, int, struct ucred *, NFSPROC_T *); #endif static int nfsrpc_allocaterpc(vnode_t, off_t, off_t, nfsv4stateid_t *, struct nfsvattr *, int *, struct ucred *, NFSPROC_T *, void *); static void nfsrv_setuplayoutget(struct nfsrv_descript *, int, uint64_t, uint64_t, uint64_t, nfsv4stateid_t *, int, int, int); static int nfsrv_parseug(struct nfsrv_descript *, int, uid_t *, gid_t *, NFSPROC_T *); static int nfsrv_parselayoutget(struct nfsmount *, struct nfsrv_descript *, nfsv4stateid_t *, int *, struct nfsclflayouthead *); static int nfsrpc_getopenlayout(struct nfsmount *, vnode_t, u_int8_t *, int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int, struct nfscldeleg **, struct ucred *, NFSPROC_T *); static int nfsrpc_getcreatelayout(vnode_t, char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *); static int nfsrpc_openlayoutrpc(struct nfsmount *, vnode_t, u_int8_t *, int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int, struct nfscldeleg **, nfsv4stateid_t *, int, int, int, int *, struct nfsclflayouthead *, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_createlayout(vnode_t, char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *, nfsv4stateid_t *, int, int, int, int *, struct nfsclflayouthead *, int *); static int nfsrpc_layoutget(struct nfsmount *, uint8_t *, int, int, uint64_t, uint64_t, uint64_t, int, int, nfsv4stateid_t *, int *, struct nfsclflayouthead *, struct ucred *, NFSPROC_T *, void *); static int nfsrpc_layoutgetres(struct nfsmount *, vnode_t, uint8_t *, int, nfsv4stateid_t *, int, uint32_t *, struct nfscllayout **, struct nfsclflayouthead *, int, int, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_copyrpc(vnode_t, off_t, vnode_t, off_t, size_t *, nfsv4stateid_t *, nfsv4stateid_t *, struct nfsvattr *, int *, struct nfsvattr *, int *, bool, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_seekrpc(vnode_t, off_t *, nfsv4stateid_t *, bool *, int, struct nfsvattr *, int *, struct ucred *); static struct mbuf *nfsm_split(struct mbuf *, uint64_t); int nfs_pnfsio(task_fn_t *, void *); /* * nfs null call from vfs. */ int nfsrpc_null(vnode_t vp, struct ucred *cred, NFSPROC_T *p) { int error; struct nfsrv_descript nfsd, *nd = &nfsd; NFSCL_REQSTART(nd, NFSPROC_NULL, vp); error = nfscl_request(nd, vp, p, cred, NULL); if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs access rpc op. * For nfs version 3 and 4, use the access rpc to check accessibility. If file * modes are changed on the server, accesses might still fail later. */ int nfsrpc_access(vnode_t vp, int acmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp) { int error; u_int32_t mode, rmode; if (acmode & VREAD) mode = NFSACCESS_READ; else mode = 0; if (vnode_vtype(vp) == VDIR) { if (acmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_DELETE); if (acmode & VEXEC) mode |= NFSACCESS_LOOKUP; } else { if (acmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (acmode & VEXEC) mode |= NFSACCESS_EXECUTE; } /* * Now, just call nfsrpc_accessrpc() to do the actual RPC. */ error = nfsrpc_accessrpc(vp, mode, cred, p, nap, attrflagp, &rmode, NULL); /* * The NFS V3 spec does not clarify whether or not * the returned access bits can be a superset of * the ones requested, so... */ if (!error && (rmode & mode) != mode) error = EACCES; return (error); } /* * The actual rpc, separated out for Darwin. */ int nfsrpc_accessrpc(vnode_t vp, u_int32_t mode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, u_int32_t *rmodep, void *stuff) { u_int32_t *tl; u_int32_t supported, rmode; int error; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; supported = mode; NFSCL_REQSTART(nd, NFSPROC_ACCESS, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(mode); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); supported = fxdr_unsigned(u_int32_t, *tl++); } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); } rmode = fxdr_unsigned(u_int32_t, *tl); if (nd->nd_flag & ND_NFSV4) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); /* * It's not obvious what should be done about * unsupported access modes. For now, be paranoid * and clear the unsupported ones. */ rmode &= supported; *rmodep = rmode; } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs open rpc */ int nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) { struct nfsclopen *op; struct nfscldeleg *dp; struct nfsfh *nfhp; struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); u_int32_t mode, clidrev; int ret, newone, error, expireret = 0, retrycnt; /* * For NFSv4, Open Ops are only done on Regular Files. */ if (vnode_vtype(vp) != VREG) return (0); mode = 0; if (amode & FREAD) mode |= NFSV4OPEN_ACCESSREAD; if (amode & FWRITE) mode |= NFSV4OPEN_ACCESSWRITE; nfhp = np->n_fhp; retrycnt = 0; #ifdef notdef { char name[100]; int namel; namel = (np->n_v4->n4_namelen < 100) ? np->n_v4->n4_namelen : 99; bcopy(NFS4NODENAME(np->n_v4), name, namel); name[namel] = '\0'; printf("rpcopen p=0x%x name=%s",p->p_pid,name); if (nfhp->nfh_len > 0) printf(" fh=0x%x\n",nfhp->nfh_fh[12]); else printf(" fhl=0\n"); } #endif do { dp = NULL; error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 1, cred, p, NULL, &op, &newone, &ret, 1, true); if (error) { return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; if (ret == NFSCLOPEN_DOOPEN) { if (np->n_v4 != NULL) { /* * For the first attempt, try and get a layout, if * pNFS is enabled for the mount. */ if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || (np->n_flag & NNOLAYOUT) != 0 || retrycnt > 0) error = nfsrpc_openrpc(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, mode, op, NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &dp, 0, 0x0, cred, p, 0, 0); else error = nfsrpc_getopenlayout(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, mode, op, NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &dp, cred, p); if (dp != NULL) { #ifdef APPLE OSBitAndAtomic((int32_t)~NDELEGMOD, (UInt32 *)&np->n_flag); #else NFSLOCKNODE(np); np->n_flag &= ~NDELEGMOD; /* * Invalidate the attribute cache, so that * attributes that pre-date the issue of a * delegation are not cached, since the * cached attributes will remain valid while * the delegation is held. */ NFSINVALATTRCACHE(np); NFSUNLOCKNODE(np); #endif (void) nfscl_deleg(nmp->nm_mountp, op->nfso_own->nfsow_clp, nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp); } } else { error = EIO; } newnfs_copyincred(cred, &op->nfso_cred); } else if (ret == NFSCLOPEN_SETCRED) /* * This is a new local open on a delegation. It needs * to have credentials so that an open can be done * against the server during recovery. */ newnfs_copyincred(cred, &op->nfso_cred); /* * nfso_opencnt is the count of how many VOP_OPEN()s have * been done on this Open successfully and a VOP_CLOSE() * is expected for each of these. * If error is non-zero, don't increment it, since the Open * hasn't succeeded yet. */ if (!error) { op->nfso_opencnt++; if (NFSHASNFSV4N(nmp) && NFSHASONEOPENOWN(nmp)) { NFSLOCKNODE(np); np->n_openstateid = op; NFSUNLOCKNODE(np); } } nfscl_openrelease(nmp, op, error, newone); if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_open"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * the actual open rpc */ int nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op, u_int8_t *name, int namelen, struct nfscldeleg **dpp, int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p, int syscred, int recursed) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscldeleg *dp, *ndp = NULL; struct nfsvattr nfsva; u_int32_t rflags, deleg; nfsattrbit_t attrbits; int error, ret, acesize, limitby; struct nfsclsession *tsep; dp = *dpp; *dpp = NULL; nfscl_reqstart(nd, NFSPROC_OPEN, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; (void) nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE); if (reclaim) { *tl = txdr_unsigned(NFSV4OPEN_CLAIMPREVIOUS); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(delegtype); } else { if (dp != NULL) { *tl = txdr_unsigned(NFSV4OPEN_CLAIMDELEGATECUR); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dp->nfsdl_stateid.seqid; *tl++ = dp->nfsdl_stateid.other[0]; *tl++ = dp->nfsdl_stateid.other[1]; *tl = dp->nfsdl_stateid.other[2]; } else { *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); } (void) nfsm_strtom(nd, name, namelen); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); (void) nfsrv_putattrbit(nd, &attrbits); if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(u_int32_t, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) op->nfso_own->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); ndp = malloc( sizeof (struct nfscldeleg) + newfhlen, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&ndp->nfsdl_owner); LIST_INIT(&ndp->nfsdl_lock); ndp->nfsdl_clp = op->nfso_own->nfsow_clp; ndp->nfsdl_fhlen = newfhlen; NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen); newnfs_copyincred(cred, &ndp->nfsdl_cred); nfscl_lockinit(&ndp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); ndp->nfsdl_stateid.seqid = *tl++; ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { ndp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: ndp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: ndp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); ndp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; } } else { ndp->nfsdl_flags = NFSCLDL_READ; } if (ret) ndp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, &ret, &acesize, p); if (error) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) goto nfsmout; if (ndp != NULL) { ndp->nfsdl_change = nfsva.na_filerev; ndp->nfsdl_modtime = nfsva.na_mtime; ndp->nfsdl_flags |= NFSCLDL_MODTIMESET; } if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM)) { do { ret = nfsrpc_openconfirm(vp, newfhp, newfhlen, op, cred, p); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_open"); } while (ret == NFSERR_DELAY); error = ret; } if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) || nfscl_assumeposixlocks) op->nfso_posixlock = 1; else op->nfso_posixlock = 0; /* * If the server is handing out delegations, but we didn't * get one because an OpenConfirm was required, try the * Open again, to get a delegation. This is a harmless no-op, * from a server's point of view. */ if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM) && (op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) && !error && dp == NULL && ndp == NULL && !recursed) { do { ret = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, &ndp, 0, 0x0, cred, p, syscred, 1); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_open2"); } while (ret == NFSERR_DELAY); if (ret) { if (ndp != NULL) { free(ndp, M_NFSCLDELEG); ndp = NULL; } if (ret == NFSERR_STALECLIENTID || ret == NFSERR_STALEDONTRECOVER || ret == NFSERR_BADSESSION) error = ret; } } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: if (!error) *dpp = ndp; else if (ndp != NULL) free(ndp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * open downgrade rpc */ int nfsrpc_opendowngrade(vnode_t vp, u_int32_t mode, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; NFSCL_REQSTART(nd, NFSPROC_OPENDOWNGRADE, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 3 * NFSX_UNSIGNED); if (NFSHASNFSV4N(VFSTONFS(vp->v_mount))) *tl++ = 0; else *tl++ = op->nfso_stateid.seqid; *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl++ = op->nfso_stateid.other[2]; *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; } if (nd->nd_repstat && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * V4 Close operation. */ int nfsrpc_close(vnode_t vp, int doclose, NFSPROC_T *p) { struct nfsclclient *clp; int error; if (vnode_vtype(vp) != VREG) return (0); if (doclose) error = nfscl_doclose(vp, &clp, p); else { error = nfscl_getclose(vp, &clp); if (error == 0) nfscl_clientrelease(clp); } return (error); } /* * Close the open. */ int nfsrpc_doclose(struct nfsmount *nmp, struct nfsclopen *op, NFSPROC_T *p, bool loop_on_delayed, bool freeop) { struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscllockowner *lp, *nlp; struct nfscllock *lop, *nlop; struct ucred *tcred; u_int64_t off = 0, len = 0; u_int32_t type = NFSV4LOCKT_READ; int error, do_unlock, trycnt; tcred = newnfs_getcred(); newnfs_copycred(&op->nfso_cred, tcred); /* * (Theoretically this could be done in the same * compound as the close, but having multiple * sequenced Ops in the same compound might be * too scary for some servers.) */ if (op->nfso_posixlock) { off = 0; len = NFS64BITSSET; type = NFSV4LOCKT_READ; } /* * Since this function is only called from VOP_INACTIVE(), no * other thread will be manipulating this Open. As such, the * lock lists are not being changed by other threads, so it should * be safe to do this without locking. */ LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) { do_unlock = 1; LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) { if (op->nfso_posixlock == 0) { off = lop->nfslo_first; len = lop->nfslo_end - lop->nfslo_first; if (lop->nfslo_type == F_WRLCK) type = NFSV4LOCKT_WRITE; else type = NFSV4LOCKT_READ; } if (do_unlock) { trycnt = 0; do { error = nfsrpc_locku(nd, nmp, lp, off, len, type, tcred, p, 0); if ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0) (void) nfs_catnap(PZERO, (int)nd->nd_repstat, "nfs_close"); } while ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0 && trycnt++ < 5); if (op->nfso_posixlock) do_unlock = 0; } nfscl_freelock(lop, 0); } /* * Do a ReleaseLockOwner. * The lock owner name nfsl_owner may be used by other opens for * other files but the lock_owner4 name that nfsrpc_rellockown() * puts on the wire has the file handle for this file appended * to it, so it can be done now. */ (void)nfsrpc_rellockown(nmp, lp, lp->nfsl_open->nfso_fh, lp->nfsl_open->nfso_fhlen, tcred, p); } /* * There could be other Opens for different files on the same * OpenOwner, so locking is required. */ NFSLOCKCLSTATE(); nfscl_lockexcl(&op->nfso_own->nfsow_rwlock, NFSCLSTATEMUTEXPTR); NFSUNLOCKCLSTATE(); do { error = nfscl_tryclose(op, tcred, nmp, p, loop_on_delayed); if (error == NFSERR_GRACE) (void) nfs_catnap(PZERO, error, "nfs_close"); } while (error == NFSERR_GRACE); NFSLOCKCLSTATE(); nfscl_lockunlock(&op->nfso_own->nfsow_rwlock); LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp) nfscl_freelockowner(lp, 0); if (freeop && error != NFSERR_DELAY) nfscl_freeopen(op, 0, true); NFSUNLOCKCLSTATE(); NFSFREECRED(tcred); return (error); } /* * The actual Close RPC. */ int nfsrpc_closerpc(struct nfsrv_descript *nd, struct nfsmount *nmp, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error; nfscl_reqstart(nd, NFSPROC_CLOSE, nmp, op->nfso_fh, op->nfso_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID); if (NFSHASNFSV4N(nmp)) { *tl++ = 0; *tl++ = 0; } else { *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = op->nfso_stateid.seqid; } *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl = op->nfso_stateid.other[2]; if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (!NFSHASNFSV4N(nmp)) NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (nd->nd_repstat == 0) NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); error = nd->nd_repstat; if (!NFSHASNFSV4N(nmp) && error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * V4 Open Confirm RPC. */ int nfsrpc_openconfirm(vnode_t vp, u_int8_t *nfhp, int fhlen, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; int error; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4N(nmp)) return (0); /* No confirmation for NFSv4.1. */ nfscl_reqstart(nd, NFSPROC_OPENCONFIRM, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID); *tl++ = op->nfso_stateid.seqid; *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl++ = op->nfso_stateid.other[2]; *tl = txdr_unsigned(op->nfso_own->nfsow_seqid); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; } error = nd->nd_repstat; if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the setclientid and setclientid confirm RPCs. Called from nfs_statfs() * when a mount has just occurred and when the server replies NFSERR_EXPIRED. */ int nfsrpc_setclient(struct nfsmount *nmp, struct nfsclclient *clp, int reclaim, bool *retokp, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; u_int8_t *cp = NULL, *cp2, addr[INET6_ADDRSTRLEN + 9]; u_short port; int error, isinet6 = 0, callblen; nfsquad_t confirm; static u_int32_t rev = 0; struct nfsclds *dsp, *odsp; struct in6_addr a6; struct nfsclsession *tsep; struct rpc_reconupcall recon; struct nfscl_reconarg *rcp; if (nfsboottime.tv_sec == 0) NFSSETBOOTTIME(nfsboottime); if (NFSHASNFSV4N(nmp)) { error = NFSERR_BADSESSION; odsp = dsp = NULL; if (retokp != NULL) { NFSLOCKMNT(nmp); odsp = TAILQ_FIRST(&nmp->nm_sess); NFSUNLOCKMNT(nmp); } if (odsp != NULL) { /* * When a session already exists, first try a * CreateSession with the extant ClientID. */ dsp = malloc(sizeof(struct nfsclds) + odsp->nfsclds_servownlen + 1, M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew; dsp->nfsclds_servownlen = odsp->nfsclds_servownlen; dsp->nfsclds_sess.nfsess_clientid = odsp->nfsclds_sess.nfsess_clientid; dsp->nfsclds_sess.nfsess_sequenceid = odsp->nfsclds_sess.nfsess_sequenceid; dsp->nfsclds_flags = odsp->nfsclds_flags; if (dsp->nfsclds_servownlen > 0) memcpy(dsp->nfsclds_serverown, odsp->nfsclds_serverown, dsp->nfsclds_servownlen + 1); mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); nfscl_initsessionslots(&dsp->nfsclds_sess); error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, &nmp->nm_sockreq, NULL, dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p); NFSCL_DEBUG(1, "create session for extant " "ClientID=%d\n", error); if (error != 0) { nfscl_freenfsclds(dsp); dsp = NULL; /* * If *retokp is true, return any error other * than NFSERR_STALECLIENTID, * NFSERR_BADSESSION or NFSERR_STALEDONTRECOVER * so that nfscl_recover() will not loop. */ if (*retokp) return (NFSERR_IO); } else *retokp = true; } else if (retokp != NULL && *retokp) return (NFSERR_IO); if (error != 0) { /* * Either there was no previous session or the * CreateSession attempt failed, so... * do an ExchangeID followed by the CreateSession. */ clp->nfsc_rev = rev++; error = nfsrpc_exchangeid(nmp, clp, &nmp->nm_sockreq, 0, NFSV4EXCH_USEPNFSMDS | NFSV4EXCH_USENONPNFS, &dsp, cred, p); NFSCL_DEBUG(1, "aft exch=%d\n", error); if (error == 0) error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, &nmp->nm_sockreq, NULL, dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p); NFSCL_DEBUG(1, "aft createsess=%d\n", error); } if (error == 0) { /* * If the session supports a backchannel, set up * the BindConnectionToSession call in the krpc * so that it is done on a reconnection. */ if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0) { rcp = mem_alloc(sizeof(*rcp)); rcp->minorvers = nmp->nm_minorvers; memcpy(rcp->sessionid, dsp->nfsclds_sess.nfsess_sessionid, NFSX_V4SESSIONID); recon.call = nfsrpc_bindconnsess; recon.arg = rcp; CLNT_CONTROL(nmp->nm_client, CLSET_RECONUPCALL, &recon); } NFSLOCKMNT(nmp); /* * The old sessions cannot be safely free'd * here, since they may still be used by * in-progress RPCs. */ tsep = NULL; if (TAILQ_FIRST(&nmp->nm_sess) != NULL) tsep = NFSMNT_MDSSESSION(nmp); TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list); /* * Wake up RPCs waiting for a slot on the * old session. These will then fail with * NFSERR_BADSESSION and be retried with the * new session by nfsv4_setsequence(). * Also wakeup() processes waiting for the * new session. */ if (tsep != NULL) wakeup(&tsep->nfsess_slots); wakeup(&nmp->nm_sess); NFSUNLOCKMNT(nmp); } else if (dsp != NULL) nfscl_freenfsclds(dsp); if (error == 0 && reclaim == 0) { error = nfsrpc_reclaimcomplete(nmp, cred, p); NFSCL_DEBUG(1, "aft reclaimcomp=%d\n", error); if (error == NFSERR_COMPLETEALREADY || error == NFSERR_NOTSUPP) /* Ignore this error. */ error = 0; } return (error); } else if (retokp != NULL && *retokp) return (NFSERR_IO); clp->nfsc_rev = rev++; /* * Allocate a single session structure for NFSv4.0, because some of * the fields are used by NFSv4.0 although it doesn't do a session. */ dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO); mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); NFSLOCKMNT(nmp); TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list); tsep = NFSMNT_MDSSESSION(nmp); NFSUNLOCKMNT(nmp); nfscl_reqstart(nd, NFSPROC_SETCLIENTID, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(nfsboottime.tv_sec); *tl = txdr_unsigned(clp->nfsc_rev); (void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen); /* * set up the callback address */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFS_CALLBCKPROG); callblen = strlen(nfsv4_callbackaddr); if (callblen == 0) cp = nfscl_getmyip(nmp, &a6, &isinet6); if (nfscl_enablecallb && nfs_numnfscbd > 0 && (callblen > 0 || cp != NULL)) { port = htons(nfsv4_cbport); cp2 = (u_int8_t *)&port; #ifdef INET6 if ((callblen > 0 && strchr(nfsv4_callbackaddr, ':')) || isinet6) { char ip6buf[INET6_ADDRSTRLEN], *ip6add; (void) nfsm_strtom(nd, "tcp6", 4); if (callblen == 0) { ip6_sprintf(ip6buf, (struct in6_addr *)cp); ip6add = ip6buf; } else { ip6add = nfsv4_callbackaddr; } snprintf(addr, INET6_ADDRSTRLEN + 9, "%s.%d.%d", ip6add, cp2[0], cp2[1]); } else #endif { (void) nfsm_strtom(nd, "tcp", 3); if (callblen == 0) snprintf(addr, INET6_ADDRSTRLEN + 9, "%d.%d.%d.%d.%d.%d", cp[0], cp[1], cp[2], cp[3], cp2[0], cp2[1]); else snprintf(addr, INET6_ADDRSTRLEN + 9, "%s.%d.%d", nfsv4_callbackaddr, cp2[0], cp2[1]); } (void) nfsm_strtom(nd, addr, strlen(addr)); } else { (void) nfsm_strtom(nd, "tcp", 3); (void) nfsm_strtom(nd, "0.0.0.0.0.0", 11); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(clp->nfsc_cbident); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); tsep->nfsess_clientid.lval[0] = *tl++; tsep->nfsess_clientid.lval[1] = *tl++; confirm.lval[0] = *tl++; confirm.lval[1] = *tl; m_freem(nd->nd_mrep); nd->nd_mrep = NULL; /* * and confirm it. */ nfscl_reqstart(nd, NFSPROC_SETCLIENTIDCFRM, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); *tl++ = tsep->nfsess_clientid.lval[0]; *tl++ = tsep->nfsess_clientid.lval[1]; *tl++ = confirm.lval[0]; *tl = confirm.lval[1]; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); m_freem(nd->nd_mrep); nd->nd_mrep = NULL; } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs getattr call. */ int nfsrpc_getattr(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!nd->nd_repstat) error = nfsm_loadattr(nd, nap); else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs getattr call with non-vnode arguments. */ int nfsrpc_getattrnovp(struct nfsmount *nmp, u_int8_t *fhp, int fhlen, int syscred, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, u_int64_t *xidp, uint32_t *leasep) { struct nfsrv_descript nfsd, *nd = &nfsd; int error, vers = NFS_VER2; nfsattrbit_t attrbits; nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, fhp, fhlen, NULL, NULL, 0, 0); if (nd->nd_flag & ND_NFSV4) { vers = NFS_VER4; NFSGETATTR_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_LEASETIME); (void) nfsrv_putattrbit(nd, &attrbits); } else if (nd->nd_flag & ND_NFSV3) { vers = NFS_VER3; } if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, vers, NULL, 1, xidp, NULL); if (error) return (error); if (nd->nd_repstat == 0) { if ((nd->nd_flag & ND_NFSV4) != 0) error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, leasep, NULL, NULL, NULL); else error = nfsm_loadattr(nd, nap); } else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do an nfs setattr operation. */ int nfsrpc_setattr(vnode_t vp, struct vattr *vap, NFSACL_T *aclp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp, void *stuff) { int error, expireret = 0, openerr, retrycnt; u_int32_t clidrev = 0, mode; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsfh *nfhp; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; if (vap != NULL && NFSATTRISSET(u_quad_t, vap, va_size)) mode = NFSV4OPEN_ACCESSWRITE; else mode = NFSV4OPEN_ACCESSREAD; retrycnt = 0; do { lckp = NULL; openerr = 1; if (NFSHASNFSV4(nmp)) { nfhp = VTONFS(vp)->n_fhp; error = nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp); if (error && vnode_vtype(vp) == VREG && (mode == NFSV4OPEN_ACCESSWRITE || nfstest_openallsetattr)) { /* * No Open stateid, so try and open the file * now. */ if (mode == NFSV4OPEN_ACCESSWRITE) openerr = nfsrpc_open(vp, FWRITE, cred, p); else openerr = nfsrpc_open(vp, FREAD, cred, p); if (!openerr) (void) nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp); } } if (vap != NULL) error = nfsrpc_setattrrpc(vp, vap, &stateid, cred, p, rnap, attrflagp, stuff); else error = nfsrpc_setaclrpc(vp, cred, p, aclp, &stateid, stuff); if (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (!openerr) (void) nfsrpc_close(vp, 0, p); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_setattr"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } static int nfsrpc_setattrrpc(vnode_t vp, struct vattr *vap, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SETATTR, vp); if (nd->nd_flag & ND_NFSV4) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); vap->va_type = vnode_vtype(vp); nfscl_fillsattr(nd, vap, vp, NFSSATTR_FULL, 0); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } else if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) error = nfscl_wcc_data(nd, vp, rnap, attrflagp, NULL, stuff); if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && !error) error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (!(nd->nd_flag & ND_NFSV3) && !nd->nd_repstat && !error) error = nfscl_postop_attr(nd, rnap, attrflagp, stuff); m_freem(nd->nd_mrep); if (nd->nd_repstat && !error) error = nd->nd_repstat; return (error); } /* * nfs lookup rpc */ int nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *stuff, uint32_t openmode) { uint32_t deleg, rflags, *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; struct nfsnode *np; struct nfsfh *nfhp; nfsattrbit_t attrbits; int error = 0, lookupp = 0, newone, ret, retop; uint8_t own[NFSV4CL_LOCKNAMELEN]; struct nfsclopen *op; struct nfscldeleg *ndp; nfsv4stateid_t stateid; *attrflagp = 0; *dattrflagp = 0; if (vnode_vtype(dvp) != VDIR) return (ENOTDIR); nmp = VFSTONFS(dvp->v_mount); if (len > NFS_MAXNAMLEN) return (ENAMETOOLONG); if (NFSHASNFSV4(nmp) && len == 1 && name[0] == '.') { /* * Just return the current dir's fh. */ np = VTONFS(dvp); nfhp = malloc(sizeof (struct nfsfh) + np->n_fhp->nfh_len, M_NFSFH, M_WAITOK); nfhp->nfh_len = np->n_fhp->nfh_len; NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len); *nfhpp = nfhp; return (0); } if (NFSHASNFSV4(nmp) && len == 2 && name[0] == '.' && name[1] == '.') { lookupp = 1; openmode = 0; NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, dvp); } else if (openmode != 0) { NFSCL_REQSTART(nd, NFSPROC_LOOKUPOPEN, dvp); nfsm_strtom(nd, name, len); } else { NFSCL_REQSTART(nd, NFSPROC_LOOKUP, dvp); (void) nfsm_strtom(nd, name, len); } if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); if (openmode != 0) { /* Test for a VREG file. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_VERIFY); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSX_UNSIGNED); *tl = vtonfsv34_type(VREG); /* Attempt the Open for VREG. */ nfscl_filllockowner(NULL, own, F_POSIX); NFSM_BUILD(tl, uint32_t *, 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_OPEN); *tl++ = 0; /* seqid, ignored. */ *tl++ = txdr_unsigned(openmode); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); *tl++ = 0; /* ClientID, ignored. */ *tl = 0; nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE); *tl = txdr_unsigned(NFSV4OPEN_CLAIMFH); } } error = nfscl_request(nd, dvp, p, cred, stuff); if (error) return (error); ndp = NULL; if (nd->nd_repstat) { /* * When an NFSv4 Lookupp returns ENOENT, it means that * the lookup is at the root of an fs, so return this dir. */ if (nd->nd_repstat == NFSERR_NOENT && lookupp) { np = VTONFS(dvp); nfhp = malloc(sizeof (struct nfsfh) + np->n_fhp->nfh_len, M_NFSFH, M_WAITOK); nfhp->nfh_len = np->n_fhp->nfh_len; NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len); *nfhpp = nfhp; m_freem(nd->nd_mrep); return (0); } if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff); else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error == 0) *dattrflagp = 1; else goto nfsmout; } /* Check Lookup operation reply status. */ if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) goto nfsmout; } /* Look for GetFH reply. */ if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) goto nfsmout; error = nfsm_getfh(nd, nfhpp); if (error) goto nfsmout; } /* Look for Getattr reply. */ if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) goto nfsmout; error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error == 0) /* Successfully got Lookup done. */ nd->nd_repstat = 0; } goto nfsmout; } if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error != 0) goto nfsmout; *dattrflagp = 1; /* Skip over the Lookup and GetFH operation status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); } error = nfsm_getfh(nd, nfhpp); if (error) goto nfsmout; error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (openmode != 0 && error == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID + 10 * NFSX_UNSIGNED); tl += 4; /* Skip over Verify+Open status. */ stateid.seqid = *tl++; stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; rflags = fxdr_unsigned(uint32_t, *(tl + 6)); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(uint32_t, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { /* * Just need to fill in the fields used by * nfscl_trydelegreturn(). * Mark the mount point as acquiring * delegations, so NFSPROC_LOOKUPOPEN will * no longer be done. */ NFSLOCKMNT(nmp); nmp->nm_privflag |= NFSMNTP_DELEGISSUED; NFSUNLOCKMNT(nmp); ndp = malloc(sizeof(struct nfscldeleg) + (*nfhpp)->nfh_len, M_NFSCLDELEG, M_WAITOK); ndp->nfsdl_fhlen = (*nfhpp)->nfh_len; NFSBCOPY((*nfhpp)->nfh_fh, ndp->nfsdl_fh, ndp->nfsdl_fhlen); newnfs_copyincred(cred, &ndp->nfsdl_cred); NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID); ndp->nfsdl_stateid.seqid = *tl++; ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } ret = nfscl_open(dvp, (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, openmode, 0, cred, p, NULL, &op, &newone, &retop, 1, true); if (ret != 0) goto nfsmout; if (newone != 0) { op->nfso_stateid.seqid = stateid.seqid; op->nfso_stateid.other[0] = stateid.other[0]; op->nfso_stateid.other[1] = stateid.other[1]; op->nfso_stateid.other[2] = stateid.other[2]; op->nfso_mode = openmode; } else { op->nfso_stateid.seqid = stateid.seqid; if (retop == NFSCLOPEN_DOOPEN) op->nfso_mode |= openmode; } if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) != 0 || nfscl_assumeposixlocks) op->nfso_posixlock = 1; else op->nfso_posixlock = 0; nfscl_openrelease(nmp, op, 0, 0); if (ndp != NULL) { /* * Since we do not have the vnode, we * cannot invalidate cached attributes. * Just return the delegation. */ nfscl_trydelegreturn(ndp, cred, nmp, p); } } if ((nd->nd_flag & ND_NFSV3) && !error) error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff); nfsmout: m_freem(nd->nd_mrep); if (!error && nd->nd_repstat) error = nd->nd_repstat; free(ndp, M_NFSCLDELEG); return (error); } /* * Do a readlink rpc. */ int nfsrpc_readlink(vnode_t vp, struct uio *uiop, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsnode *np = VTONFS(vp); nfsattrbit_t attrbits; int error, len, cangetattr = 1; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READLINK, vp); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (!nd->nd_repstat && !error) { NFSM_STRSIZ(len, NFS_MAXPATHLEN); /* * This seems weird to me, but must have been added to * FreeBSD for some reason. The only thing I can think of * is that there was/is some server that replies with * more link data than it should? */ if (len == NFS_MAXPATHLEN) { NFSLOCKNODE(np); if (np->n_size > 0 && np->n_size < NFS_MAXPATHLEN) { len = np->n_size; cangetattr = 0; } NFSUNLOCKNODE(np); } error = nfsm_mbufuio(nd, uiop, len); if ((nd->nd_flag & ND_NFSV4) && !error && cangetattr) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Read operation. */ int nfsrpc_read(vnode_t vp, struct uio *uiop, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { int error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct ucred *newcred; struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; newcred = cred; if (NFSHASNFSV4(nmp)) { nfhp = np->n_fhp; newcred = NFSNEWCRED(cred); } retrycnt = 0; do { lckp = NULL; if (NFSHASNFSV4(nmp)) (void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, newcred, p, &stateid, &lckp); error = nfsrpc_readrpc(vp, uiop, newcred, &stateid, p, nap, attrflagp, stuff); if (error == NFSERR_OPENMODE) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_read"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; if (NFSHASNFSV4(nmp)) NFSFREECRED(newcred); return (error); } /* * The actual read RPC. */ static int nfsrpc_readrpc(vnode_t vp, struct uio *uiop, struct ucred *cred, nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; int error = 0, len, retlen, tsiz, eof = 0; struct nfsrv_descript nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsrv_descript *nd = &nfsd; int rsize; off_t tmp_off; *attrflagp = 0; tsiz = uiop->uio_resid; tmp_off = uiop->uio_offset + tsiz; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) { NFSUNLOCKMNT(nmp); return (EFBIG); } rsize = nmp->nm_rsize; NFSUNLOCKMNT(nmp); nd->nd_mrep = NULL; while (tsiz > 0) { *attrflagp = 0; len = (tsiz > rsize) ? rsize : tsiz; NFSCL_REQSTART(nd, NFSPROC_READ, vp); if (nd->nd_flag & ND_NFSV4) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED * 3); if (nd->nd_flag & ND_NFSV2) { *tl++ = txdr_unsigned(uiop->uio_offset); *tl++ = txdr_unsigned(len); *tl = 0; } else { txdr_hyper(uiop->uio_offset, tl); *(tl + 2) = txdr_unsigned(len); } /* * Since I can't do a Getattr for NFSv4 for Write, there * doesn't seem any point in doing one here, either. * (See the comment in nfsrpc_writerpc() for more info.) */ error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } else if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV2)) { error = nfsm_loadattr(nd, nap); if (!error) *attrflagp = 1; } if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); } NFSM_STRSIZ(retlen, len); error = nfsm_mbufuio(nd, uiop, retlen); if (error) goto nfsmout; m_freem(nd->nd_mrep); nd->nd_mrep = NULL; tsiz -= retlen; if (!(nd->nd_flag & ND_NFSV2)) { if (eof || retlen == 0) tsiz = 0; } else if (retlen < len) tsiz = 0; } return (0); nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } /* * nfs write operation * When called_from_strategy != 0, it should return EIO for an error that * indicates recovery is in progress, so that the buffer will be left * dirty and be written back to the server later. If it loops around, * the recovery thread could get stuck waiting for the buffer and recovery * will then deadlock. */ int nfsrpc_write(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff, int called_from_strategy) { int error, expireret = 0, retrycnt, nostateid; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct ucred *newcred; struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; *must_commit = 0; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; newcred = cred; if (NFSHASNFSV4(nmp)) { newcred = NFSNEWCRED(cred); nfhp = np->n_fhp; } retrycnt = 0; do { lckp = NULL; nostateid = 0; if (NFSHASNFSV4(nmp)) { (void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, newcred, p, &stateid, &lckp); if (stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { nostateid = 1; NFSCL_DEBUG(1, "stateid0 in write\n"); } } /* * If there is no stateid for NFSv4, it means this is an * extraneous write after close. Basically a poorly * implemented buffer cache. Just don't do the write. */ if (nostateid) error = 0; else error = nfsrpc_writerpc(vp, uiop, iomode, must_commit, newcred, &stateid, p, nap, attrflagp, stuff); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_write"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER) && called_from_strategy == 0) || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && (retrycnt >= 4 || ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER) && called_from_strategy != 0))) error = EIO; if (NFSHASNFSV4(nmp)) NFSFREECRED(newcred); return (error); } /* * The actual write RPC. */ static int nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, struct ucred *cred, nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); int error = 0, len, tsiz, rlen, commit, committed = NFSWRITE_FILESYNC; int wccflag = 0, wsize; int32_t backup; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; off_t tmp_off; KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1")); *attrflagp = 0; tsiz = uiop->uio_resid; tmp_off = uiop->uio_offset + tsiz; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) { NFSUNLOCKMNT(nmp); return (EFBIG); } wsize = nmp->nm_wsize; NFSUNLOCKMNT(nmp); nd->nd_mrep = NULL; /* NFSv2 sometimes does a write with */ nd->nd_repstat = 0; /* uio_resid == 0, so the while is not done */ while (tsiz > 0) { *attrflagp = 0; len = (tsiz > wsize) ? wsize : tsiz; NFSCL_REQSTART(nd, NFSPROC_WRITE, vp); if (nd->nd_flag & ND_NFSV4) { nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+2*NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+3*NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else { u_int32_t x; NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* * Not sure why someone changed this, since the * RFC clearly states that "beginoffset" and * "totalcount" are ignored, but it wouldn't * surprise me if there's a busted server out there. */ /* Set both "begin" and "current" to non-garbage. */ x = txdr_unsigned((u_int32_t)uiop->uio_offset); *tl++ = x; /* "begin offset" */ *tl++ = x; /* "current offset" */ x = txdr_unsigned(len); *tl++ = x; /* total to this offset */ *tl = x; /* size of this write */ } nfsm_uiombuf(nd, uiop, len); /* * Although it is tempting to do a normal Getattr Op in the * NFSv4 compound, the result can be a nearly hung client * system if the Getattr asks for Owner and/or OwnerGroup. * It occurs when the client can't map either the Owner or * Owner_group name in the Getattr reply to a uid/gid. When * there is a cache miss, the kernel does an upcall to the * nfsuserd. Then, it can try and read the local /etc/passwd * or /etc/group file. It can then block in getnewbuf(), * waiting for dirty writes to be pushed to the NFS server. * The only reason this doesn't result in a complete * deadlock, is that the upcall times out and allows * the write to complete. However, progress is so slow * that it might just as well be deadlocked. * As such, we get the rest of the attributes, but not * Owner or Owner_group. * nb: nfscl_loadattrcache() needs to be told that these * partial attributes from a write rpc are being * passed in, via a argument flag. */ if (nd->nd_flag & ND_NFSV4) { NFSWRITEGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat) { /* * In case the rpc gets retried, roll * the uio fields changed by nfsm_uiombuf() * back. */ uiop->uio_offset -= len; uiop->uio_resid += len; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - len; uiop->uio_iov->iov_len += len; } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { error = nfscl_wcc_data(nd, vp, nap, attrflagp, &wccflag, stuff); if (error) goto nfsmout; } if (!nd->nd_repstat) { if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); if (rlen == 0) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. */ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY((caddr_t)tl, (caddr_t)&nmp->nm_verf[0], NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); } NFSUNLOCKMNT(nmp); } if (nd->nd_flag & ND_NFSV4) NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (nd->nd_flag & (ND_NFSV2 | ND_NFSV4)) { error = nfsm_loadattr(nd, nap); if (!error) *attrflagp = NFS_LATTR_NOSHRINK; } } else { error = nd->nd_repstat; } if (error) goto nfsmout; NFSWRITERPC_SETTIME(wccflag, np, nap, (nd->nd_flag & ND_NFSV4)); m_freem(nd->nd_mrep); nd->nd_mrep = NULL; tsiz -= len; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat && !error) error = nd->nd_repstat; return (error); } /* * Do an nfs deallocate operation. */ int nfsrpc_deallocate(vnode_t vp, off_t offs, off_t len, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { int error, expireret = 0, openerr, retrycnt; uint32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsfh *nfhp; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; retrycnt = 0; do { lckp = NULL; openerr = 1; nfhp = VTONFS(vp)->n_fhp; error = nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp); if (error != 0) { /* * No Open stateid, so try and open the file * now. */ openerr = nfsrpc_open(vp, FWRITE, cred, p); if (openerr == 0) nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp); } error = nfsrpc_deallocaterpc(vp, offs, len, &stateid, nap, attrflagp, cred, p, stuff); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (openerr == 0) nfsrpc_close(vp, 0, p); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_deallocate"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * The actual deallocate RPC. */ static int nfsrpc_deallocaterpc(vnode_t vp, off_t offs, off_t len, nfsv4stateid_t *stateidp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { uint32_t *tl; struct nfsnode *np = VTONFS(vp); int error, wccflag; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_DEALLOCATE, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER); txdr_hyper(offs, tl); tl += 2; txdr_hyper(len, tl); NFSWRITEGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error != 0) return (error); wccflag = 0; error = nfscl_wcc_data(nd, vp, nap, attrflagp, &wccflag, stuff); if (error != 0) goto nfsmout; if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error != 0) goto nfsmout; *attrflagp = NFS_LATTR_NOSHRINK; } NFSWRITERPC_SETTIME(wccflag, np, nap, 1); nfsmout: m_freem(nd->nd_mrep); if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. */ int nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap, u_int32_t rdev, enum vtype vtyp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; int error = 0; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_MKNOD, dvp); if (nd->nd_flag & ND_NFSV4) { if (vtyp == VBLK || vtyp == VCHR) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = vtonfsv34_type(vtyp); *tl++ = txdr_unsigned(NFSMAJOR(rdev)); *tl = txdr_unsigned(NFSMINOR(rdev)); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_type(vtyp); } } (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_type(vtyp); } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) nfscl_fillsattr(nd, vap, dvp, 0, 0); if ((nd->nd_flag & ND_NFSV3) && (vtyp == VCHR || vtyp == VBLK)) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSMAJOR(rdev)); *tl = txdr_unsigned(NFSMINOR(rdev)); } if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } if (nd->nd_flag & ND_NFSV2) nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZERDEV, rdev); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; } error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV3) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!error && nd->nd_repstat) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs file create call * Mostly just call the approriate routine. (I separated out v4, so that * error recovery wouldn't be as difficult.) */ int nfsrpc_create(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { int error = 0, newone, expireret = 0, retrycnt, unlocked; struct nfsclowner *owp; struct nfscldeleg *dp; struct nfsmount *nmp = VFSTONFS(dvp->v_mount); u_int32_t clidrev; if (NFSHASNFSV4(nmp)) { retrycnt = 0; do { dp = NULL; error = nfscl_open(dvp, NULL, 0, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, &owp, NULL, &newone, NULL, 1, true); if (error) return (error); if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || retrycnt > 0) error = nfsrpc_createv4(dvp, name, namelen, vap, cverf, fmode, owp, &dp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, &unlocked); else error = nfsrpc_getcreatelayout(dvp, name, namelen, vap, cverf, fmode, owp, &dp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, &unlocked); /* * There is no need to invalidate cached attributes here, * since new post-delegation issue attributes are always * returned by nfsrpc_createv4() and these will update the * attribute cache. */ if (dp != NULL) (void) nfscl_deleg(nmp->nm_mountp, owp->nfsow_clp, (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, &dp); nfscl_ownerrelease(nmp, owp, error, newone, unlocked); if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_open"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; } else { error = nfsrpc_createv23(dvp, name, namelen, vap, cverf, fmode, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff); } return (error); } /* * The create rpc for v2 and 3. */ static int nfsrpc_createv23(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; int error = 0; struct nfsrv_descript nfsd, *nd = &nfsd; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp); (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZE0, 0); } error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV3) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } static int nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp) { u_int32_t *tl; int error = 0, deleg, newone, ret, acesize, limitby; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsclopen *op; struct nfscldeleg *dp = NULL; struct nfsnode *np; struct nfsfh *nfhp; nfsattrbit_t attrbits; nfsv4stateid_t stateid; u_int32_t rflags; struct nfsmount *nmp; struct nfsclsession *tsep; nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); *unlockedp = 0; *nfhpp = NULL; *dpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp); /* * For V4, this is actually an Open op. */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; (void) nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_CREATE); if (fmode & O_EXCL) { if (NFSHASNFSV4N(nmp)) { if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { /* NFSv4.0 */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); (void) nfsm_strtom(nd, name, namelen); /* Get the new file's handle and attributes. */ NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); /* Get the directory's post-op attributes. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); NFSCL_INCRSEQID(owp->nfsow_seqid, nd); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); stateid.seqid = *tl++; stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); (void) nfsrv_getattrbits(nd, &attrbits, NULL, NULL); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) owp->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); dp = malloc( sizeof (struct nfscldeleg) + NFSX_V4FHMAX, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&dp->nfsdl_owner); LIST_INIT(&dp->nfsdl_lock); dp->nfsdl_clp = owp->nfsow_clp; newnfs_copyincred(cred, &dp->nfsdl_cred); nfscl_lockinit(&dp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); dp->nfsdl_stateid.seqid = *tl++; dp->nfsdl_stateid.other[0] = *tl++; dp->nfsdl_stateid.other[1] = *tl++; dp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { dp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: dp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: dp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); dp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; } } else { dp->nfsdl_flags = NFSCLDL_READ; } if (ret) dp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &dp->nfsdl_ace, &ret, &acesize, p); if (error) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; /* Get rid of the PutFH and Getattr status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error) goto nfsmout; *dattrflagp = 1; if (dp != NULL && *attrflagp) { dp->nfsdl_change = nnap->na_filerev; dp->nfsdl_modtime = nnap->na_mtime; dp->nfsdl_flags |= NFSCLDL_MODTIMESET; } /* * We can now complete the Open state. */ nfhp = *nfhpp; if (dp != NULL) { dp->nfsdl_fhlen = nfhp->nfh_len; NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len); } /* * Get an Open structure that will be * attached to the OpenOwner, acquired already. */ error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, NULL, &op, &newone, NULL, 0, false); if (error) goto nfsmout; op->nfso_stateid = stateid; newnfs_copyincred(cred, &op->nfso_cred); if ((rflags & NFSV4OPEN_RESULTCONFIRM)) { do { ret = nfsrpc_openconfirm(dvp, nfhp->nfh_fh, nfhp->nfh_len, op, cred, p); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_create"); } while (ret == NFSERR_DELAY); error = ret; } /* * If the server is handing out delegations, but we didn't * get one because an OpenConfirm was required, try the * Open again, to get a delegation. This is a harmless no-op, * from a server's point of view. */ if ((rflags & NFSV4OPEN_RESULTCONFIRM) && (owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) && !error && dp == NULL) { do { ret = nfsrpc_openrpc(VFSTONFS(dvp->v_mount), dvp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), op, name, namelen, &dp, 0, 0x0, cred, p, 0, 1); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_crt2"); } while (ret == NFSERR_DELAY); if (ret) { if (dp != NULL) { free(dp, M_NFSCLDELEG); dp = NULL; } if (ret == NFSERR_STALECLIENTID || ret == NFSERR_STALEDONTRECOVER || ret == NFSERR_BADSESSION) error = ret; } } nfscl_openrelease(nmp, op, error, newone); *unlockedp = 1; } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID) nfscl_initiate_recovery(owp->nfsow_clp); nfsmout: if (!error) *dpp = dp; else if (dp != NULL) free(dp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Nfs remove rpc */ int nfsrpc_remove(vnode_t dvp, char *name, int namelen, vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsnode *np; struct nfsmount *nmp; nfsv4stateid_t dstateid; int error, ret = 0, i; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); nmp = VFSTONFS(dvp->v_mount); tryagain: if (NFSHASNFSV4(nmp) && ret == 0) { ret = nfscl_removedeleg(vp, p, &dstateid); if (ret == 1) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGREMOVE, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dstateid.seqid; *tl++ = dstateid.other[0]; *tl++ = dstateid.other[1]; *tl++ = dstateid.other[2]; *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(dvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_REMOVE); } } else { ret = 0; } if (ret == 0) NFSCL_REQSTART(nd, NFSPROC_REMOVE, dvp); (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. */ if (ret > 0 && nd->nd_repstat != 0 && (nd->nd_flag & ND_NOMOREDATA)) { /* * If the Delegreturn failed, try again without * it. The server will Recall, as required. */ m_freem(nd->nd_mrep); goto tryagain; } for (i = 0; i < (ret * 2); i++) { if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } } error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do an nfs rename rpc. */ int nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen, vnode_t tdvp, vnode_t tvp, char *tnameptr, int tnamelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *fnap, struct nfsvattr *tnap, int *fattrflagp, int *tattrflagp, void *fstuff, void *tstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; struct nfsnode *np; nfsattrbit_t attrbits; nfsv4stateid_t fdstateid, tdstateid; int error = 0, ret = 0, gottd = 0, gotfd = 0, i; *fattrflagp = 0; *tattrflagp = 0; nmp = VFSTONFS(fdvp->v_mount); if (fnamelen > NFS_MAXNAMLEN || tnamelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); tryagain: if (NFSHASNFSV4(nmp) && ret == 0) { ret = nfscl_renamedeleg(fvp, &fdstateid, &gotfd, tvp, &tdstateid, &gottd, p); if (gotfd && gottd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME2, fvp); } else if (gotfd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, fvp); } else if (gottd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, tvp); } if (gotfd) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = fdstateid.seqid; *tl++ = fdstateid.other[0]; *tl++ = fdstateid.other[1]; *tl = fdstateid.other[2]; if (gottd) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(tvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_DELEGRETURN); } } if (gottd) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = tdstateid.seqid; *tl++ = tdstateid.other[0]; *tl++ = tdstateid.other[1]; *tl = tdstateid.other[2]; } if (ret > 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(fdvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_SAVEFH); } } else { ret = 0; } if (ret == 0) NFSCL_REQSTART(nd, NFSPROC_RENAME, fdvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWCCATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_V4WCCATTR; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_RENAME); } (void) nfsm_strtom(nd, fnameptr, fnamelen); if (!(nd->nd_flag & ND_NFSV4)) (void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); (void) nfsm_strtom(nd, tnameptr, tnamelen); error = nfscl_request(nd, fdvp, p, cred, fstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. */ if (ret > 0 && nd->nd_repstat != 0 && (nd->nd_flag & ND_NOMOREDATA)) { /* * If the Delegreturn failed, try again without * it. The server will Recall, as required. */ m_freem(nd->nd_mrep); goto tryagain; } for (i = 0; i < (ret * 2); i++) { if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) { if (i == 0 && ret > 1) { /* * If the Delegreturn failed, try again * without it. The server will Recall, as * required. * If ret > 1, the first iteration of this * loop is the second DelegReturn result. */ m_freem(nd->nd_mrep); goto tryagain; } else { nd->nd_flag |= ND_NOMOREDATA; } } } } /* Now, the first wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } error = nfscl_wcc_data(nd, fdvp, fnap, fattrflagp, NULL, fstuff); /* and the second wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && !error) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } if (!error) error = nfscl_wcc_data(nd, tdvp, tnap, tattrflagp, NULL, tstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs hard link create rpc */ int nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error = 0; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_LINK, vp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); } (void) nfsm_fhtom(nd, VTONFS(dvp)->n_fhp->nfh_fh, VTONFS(dvp)->n_fhp->nfh_len, 0); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWCCATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_V4WCCATTR; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LINK); } (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, vp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, dstuff); if (!error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* * First, parse out the PutFH and Getattr result. */ NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (!(*(tl + 1))) NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; /* * Get the pre-op attributes. */ error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs symbolic link create rpc */ int nfsrpc_symlink(vnode_t dvp, char *name, int namelen, const char *target, struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; int slen, error = 0; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; nmp = VFSTONFS(dvp->v_mount); slen = strlen(target); if (slen > NFS_MAXPATHLEN || namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_SYMLINK, dvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFLNK); (void) nfsm_strtom(nd, target, slen); } (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) nfscl_fillsattr(nd, vap, dvp, 0, 0); if (!(nd->nd_flag & ND_NFSV4)) (void) nfsm_strtom(nd, target, slen); if (nd->nd_flag & ND_NFSV2) nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if ((nd->nd_flag & ND_NFSV3) && !error) { if (!nd->nd_repstat) error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (!error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. * Only do this if vfs.nfs.ignore_eexist is set. * Never do this for NFSv4.1 or later minor versions, since sessions * should guarantee "exactly once" RPC semantics. */ if (error == EEXIST && nfsignore_eexist != 0 && (!NFSHASNFSV4(nmp) || nmp->nm_minorvers == 0)) error = 0; return (error); } /* * nfs make dir rpc */ int nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error = 0; struct nfsfh *fhp; struct nfsmount *nmp; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; nmp = VFSTONFS(dvp->v_mount); fhp = VTONFS(dvp)->n_fhp; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_MKDIR, dvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFDIR); } (void) nfsm_strtom(nd, name, namelen); nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, fhp->nfh_fh, fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!nd->nd_repstat && !error) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); } if (!error) error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error == 0 && (nd->nd_flag & ND_NFSV4) != 0) { /* Get rid of the PutFH and Getattr status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error == 0) *dattrflagp = 1; } } if ((nd->nd_flag & ND_NFSV3) && !error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. * Only do this if vfs.nfs.ignore_eexist is set. * Never do this for NFSv4.1 or later minor versions, since sessions * should guarantee "exactly once" RPC semantics. */ if (error == EEXIST && nfsignore_eexist != 0 && (!NFSHASNFSV4(nmp) || nmp->nm_minorvers == 0)) error = 0; return (error); } /* * nfs remove directory call */ int nfsrpc_rmdir(vnode_t dvp, char *name, int namelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp, void *dstuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_RMDIR, dvp); (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * Readdir rpc. * Always returns with either uio_resid unchanged, if you are at the * end of the directory, or uio_resid == 0, with all DIRBLKSIZ chunks * filled in. * I felt this would allow caching of directory blocks more easily * than returning a pertially filled block. * Directory offset cookies: * Oh my, what to do with them... * I can think of three ways to deal with them: * 1 - have the layer above these RPCs maintain a map between logical * directory byte offsets and the NFS directory offset cookies * 2 - pass the opaque directory offset cookies up into userland * and let the libc functions deal with them, via the system call * 3 - return them to userland in the "struct dirent", so future versions * of libc can use them and do whatever is necessary to make things work * above these rpc calls, in the meantime * For now, I do #3 by "hiding" the directory offset cookies after the * d_name field in struct dirent. This is space inside d_reclen that * will be ignored by anything that doesn't know about them. * The directory offset cookies are filled in as the last 8 bytes of * each directory entry, after d_name. Someday, the userland libc * functions may be able to use these. In the meantime, it satisfies * OpenBSD's requirements for cookies being returned. * If expects the directory offset cookie for the read to be in uio_offset * and returns the one for the next entry after this directory block in * there, as well. */ int nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, int *eofp, void *stuff) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; nfsquad_t cookie, ncookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); struct nfsvattr nfsva; struct nfsrv_descript nfsd, *nd = &nfsd; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int reqsize, tryformoredirs = 1, readsize, eof = 0, gotmnton = 0; u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX; char *cp; nfsattrbit_t attrbits, dattrbits; u_int32_t rderr, *tl2 = NULL; size_t tresid; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirrpc bad uio")); ncookie.lval[0] = ncookie.lval[1] = 0; /* * There is no point in reading a lot more than uio_resid, however * adding one additional DIRBLKSIZ makes sense. Since uio_resid * and nm_readdirsize are both exact multiples of DIRBLKSIZ, this * will never make readsize > nm_readdirsize. */ readsize = nmp->nm_readdirsize; if (readsize > uiop->uio_resid) readsize = uiop->uio_resid + DIRBLKSIZ; *attrflagp = 0; if (eofp) *eofp = 0; tresid = uiop->uio_resid; cookie.lval[0] = cookiep->nfsuquad[0]; cookie.lval[1] = cookiep->nfsuquad[1]; nd->nd_mrep = NULL; /* * For NFSv4, first create the "." and ".." entries. */ if (NFSHASNFSV4(nmp)) { reqsize = 6 * NFSX_UNSIGNED; NFSGETATTR_ATTRBIT(&dattrbits); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE); if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_MOUNTEDONFILEID)) { NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); gotmnton = 1; } else { /* * Must fake it. Use the fileno, except when the * fsid is != to that of the directory. For that * case, generate a fake fileno that is not the same. */ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID); gotmnton = 0; } /* * Joy, oh joy. For V4 we get to hand craft '.' and '..'. */ if (uiop->uio_offset == 0) { NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); dotfileid = 0; /* Fake out the compiler. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { error = nfsm_loadattr(nd, &nfsva); if (error != 0) goto nfsmout; dotfileid = nfsva.na_fileid; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 4)); if (len > 0 && len <= NFSX_V4FHMAX) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else error = EPERM; if (!error) { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dotdotfileid = nfsva.na_mntonfileno; else dotdotfileid = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dotdotfileid = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dotdotfileid = fakefileno; } } } else if (nd->nd_repstat == NFSERR_NOENT) { /* * Lookupp returns NFSERR_NOENT when we are * at the root, so just use the current dir. */ nd->nd_repstat = 0; dotdotfileid = dotfileid; } else { error = nd->nd_repstat; } m_freem(nd->nd_mrep); if (error) return (error); nd->nd_mrep = NULL; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotfileid; dp->d_namlen = 1; *((uint64_t *)dp->d_name) = 0; /* Zero pad it. */ dp->d_name[0] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotdotfileid; dp->d_namlen = 2; *((uint64_t *)dp->d_name) = 0; dp->d_name[0] = '.'; dp->d_name[1] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; } NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR); } else { reqsize = 5 * NFSX_UNSIGNED; } /* * Loop around doing readdir rpc's of size readsize. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READDIR, vp); if (nd->nd_flag & ND_NFSV2) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = cookie.lval[1]; *tl = txdr_unsigned(readsize); } else { NFSM_BUILD(tl, u_int32_t *, reqsize); *tl++ = cookie.lval[0]; *tl++ = cookie.lval[1]; if (cookie.qval == 0) { *tl++ = 0; *tl++ = 0; } else { NFSLOCKNODE(dnp); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; NFSUNLOCKNODE(dnp); } if (nd->nd_flag & ND_NFSV4) { *tl++ = txdr_unsigned(readsize); *tl = txdr_unsigned(readsize); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &dattrbits); } else { *tl = txdr_unsigned(readsize); } } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!(nd->nd_flag & ND_NFSV2)) { if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (!nd->nd_repstat && !error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); NFSLOCKNODE(dnp); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; NFSUNLOCKNODE(dnp); } } if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); if (!more_dirs) tryformoredirs = 0; /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; len = fxdr_unsigned(int, *tl); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); nfsva.na_fileid = fxdr_hyper(tl); tl += 2; len = fxdr_unsigned(int, *tl); } else { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_fileid = fxdr_unsigned(uint64_t, *tl++); len = fxdr_unsigned(int, *tl); } if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; goto nfsmout; } tlen = roundup2(len, 8); if (tlen == len) tlen += 8; /* To ensure null termination. */ left = DIRBLKSIZ - blksiz; if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) { NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; blksiz = 0; } if (_GENERIC_DIRLEN(len) + NFSX_HYPER > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_namlen = len; dp->d_reclen = _GENERIC_DIRLEN(len) + NFSX_HYPER; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_resid -= DIRHDSIZ; uiop->uio_offset += DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; error = nfsm_mbufuio(nd, uiop, len); if (error) goto nfsmout; cp = uiop->uio_iov->iov_base; tlen -= len; NFSBZERO(cp, tlen); cp += tlen; /* points to cookie storage */ tl2 = (u_int32_t *)cp; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen + NFSX_HYPER; uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; uiop->uio_resid -= tlen + NFSX_HYPER; uiop->uio_offset += (tlen + NFSX_HYPER); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV4) { rderr = 0; nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, &rderr, p, cred); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; } else { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); ncookie.lval[0] = 0; ncookie.lval[1] = *tl++; } if (bigenough) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; } else { if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dp->d_fileno = nfsva.na_mntonfileno; else dp->d_fileno = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dp->d_fileno = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dp->d_fileno = fakefileno; } dp->d_type = vtonfs_dtype(nfsva.na_type); } } else { dp->d_fileno = nfsva.na_fileid; } *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] = ncookie.lval[0]; *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] = ncookie.lval[1]; } more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); if (tryformoredirs) more_dirs = !eof; if (nd->nd_flag & ND_NFSV4) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } } m_freem(nd->nd_mrep); nd->nd_mrep = NULL; } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; } /* * If returning no data, assume end of file. * If not bigenough, return not end of file, since you aren't * returning all the data * Otherwise, return the eof flag from the server. */ if (eofp) { if (tresid == ((size_t)(uiop->uio_resid))) *eofp = 1; else if (!bigenough) *eofp = 0; else *eofp = eof; } /* * Add extra empty records to any remaining DIRBLKSIZ chunks. */ while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) { dp = (struct dirent *)uiop->uio_iov->iov_base; NFSBZERO(dp, DIRBLKSIZ); dp->d_type = DT_UNKNOWN; tl = (u_int32_t *)&dp->d_name[4]; *tl++ = cookie.lval[0]; *tl = cookie.lval[1]; dp->d_reclen = DIRBLKSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRBLKSIZ; uiop->uio_iov->iov_len -= DIRBLKSIZ; uiop->uio_resid -= DIRBLKSIZ; uiop->uio_offset += DIRBLKSIZ; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } #ifndef APPLE /* * NFS V3 readdir plus RPC. Used in place of nfsrpc_readdir(). * (Also used for NFS V4 when mount flag set.) * (ditto above w.r.t. multiple of DIRBLKSIZ, etc.) */ int nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, int *eofp, void *stuff) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; vnode_t newvp = NULLVP; struct nfsrv_descript nfsd, *nd = &nfsd; struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp), *np; struct nfsvattr nfsva; struct nfsfh *nfhp; nfsquad_t cookie, ncookie; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag, tryformoredirs = 1, eof = 0, gotmnton = 0; int isdotdot = 0, unlocknewvp = 0; u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX; u_int64_t fileno = 0; char *cp; nfsattrbit_t attrbits, dattrbits; size_t tresid; u_int32_t *tl2 = NULL, rderr; struct timespec dctime; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirplusrpc bad uio")); ncookie.lval[0] = ncookie.lval[1] = 0; timespecclear(&dctime); *attrflagp = 0; if (eofp != NULL) *eofp = 0; ndp->ni_dvp = vp; nd->nd_mrep = NULL; cookie.lval[0] = cookiep->nfsuquad[0]; cookie.lval[1] = cookiep->nfsuquad[1]; tresid = uiop->uio_resid; /* * For NFSv4, first create the "." and ".." entries. */ if (NFSHASNFSV4(nmp)) { NFSGETATTR_ATTRBIT(&dattrbits); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID); if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_MOUNTEDONFILEID)) { NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); gotmnton = 1; } else { /* * Must fake it. Use the fileno, except when the * fsid is != to that of the directory. For that * case, generate a fake fileno that is not the same. */ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID); gotmnton = 0; } /* * Joy, oh joy. For V4 we get to hand craft '.' and '..'. */ if (uiop->uio_offset == 0) { NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); dotfileid = 0; /* Fake out the compiler. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { error = nfsm_loadattr(nd, &nfsva); if (error != 0) goto nfsmout; dctime = nfsva.na_ctime; dotfileid = nfsva.na_fileid; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 4)); if (len > 0 && len <= NFSX_V4FHMAX) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else error = EPERM; if (!error) { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dotdotfileid = nfsva.na_mntonfileno; else dotdotfileid = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dotdotfileid = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dotdotfileid = fakefileno; } } } else if (nd->nd_repstat == NFSERR_NOENT) { /* * Lookupp returns NFSERR_NOENT when we are * at the root, so just use the current dir. */ nd->nd_repstat = 0; dotdotfileid = dotfileid; } else { error = nd->nd_repstat; } m_freem(nd->nd_mrep); if (error) return (error); nd->nd_mrep = NULL; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotfileid; dp->d_namlen = 1; *((uint64_t *)dp->d_name) = 0; /* Zero pad it. */ dp->d_name[0] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotdotfileid; dp->d_namlen = 2; *((uint64_t *)dp->d_name) = 0; dp->d_name[0] = '.'; dp->d_name[1] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; } NFSREADDIRPLUS_ATTRBIT(&attrbits); if (gotmnton) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_TIMECREATE)) NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); } /* * Loop around doing readdir rpc's of size nm_readdirsize. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READDIRPLUS, vp); NFSM_BUILD(tl, u_int32_t *, 6 * NFSX_UNSIGNED); *tl++ = cookie.lval[0]; *tl++ = cookie.lval[1]; if (cookie.qval == 0) { *tl++ = 0; *tl++ = 0; } else { NFSLOCKNODE(dnp); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; NFSUNLOCKNODE(dnp); } *tl++ = txdr_unsigned(nmp->nm_readdirsize); *tl = txdr_unsigned(nmp->nm_readdirsize); if (nd->nd_flag & ND_NFSV4) { (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &dattrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } if ((nd->nd_flag & ND_NFSV3) != 0 && *attrflagp != 0) dctime = nap->na_ctime; NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); NFSLOCKNODE(dnp); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl++; NFSUNLOCKNODE(dnp); more_dirs = fxdr_unsigned(int, *tl); if (!more_dirs) tryformoredirs = 0; /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (nd->nd_flag & ND_NFSV4) { ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; } else { fileno = fxdr_hyper(tl); tl += 2; } len = fxdr_unsigned(int, *tl); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; goto nfsmout; } tlen = roundup2(len, 8); if (tlen == len) tlen += 8; /* To ensure null termination. */ left = DIRBLKSIZ - blksiz; if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) { NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; blksiz = 0; } if (_GENERIC_DIRLEN(len) + NFSX_HYPER > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_namlen = len; dp->d_reclen = _GENERIC_DIRLEN(len) + NFSX_HYPER; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_resid -= DIRHDSIZ; uiop->uio_offset += DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; NFSCNHASHZERO(cnp); error = nfsm_mbufuio(nd, uiop, len); if (error) goto nfsmout; cp = uiop->uio_iov->iov_base; tlen -= len; NFSBZERO(cp, tlen); cp += tlen; /* points to cookie storage */ tl2 = (u_int32_t *)cp; if (len == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') isdotdot = 1; else isdotdot = 0; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen + NFSX_HYPER; uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; uiop->uio_resid -= tlen + NFSX_HYPER; uiop->uio_offset += (tlen + NFSX_HYPER); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) goto nfsmout; } nfhp = NULL; if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; attrflag = fxdr_unsigned(int, *tl); if (attrflag) { error = nfsm_loadattr(nd, &nfsva); if (error) goto nfsmout; } NFSM_DISSECT(tl,u_int32_t *,NFSX_UNSIGNED); if (*tl) { error = nfsm_getfh(nd, &nfhp); if (error) goto nfsmout; } if (!attrflag && nfhp != NULL) { free(nfhp, M_NFSFH); nfhp = NULL; } } else { rderr = 0; nfsva.na_mntonfileno = 0xffffffff; error = nfsv4_loadattr(nd, NULL, &nfsva, &nfhp, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, &rderr, p, cred); if (error) goto nfsmout; } if (bigenough) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; } else if (gotmnton) { if (nfsva.na_mntonfileno != 0xffffffff) dp->d_fileno = nfsva.na_mntonfileno; else dp->d_fileno = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dp->d_fileno = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dp->d_fileno = fakefileno; } } else { dp->d_fileno = fileno; } *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] = ncookie.lval[0]; *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] = ncookie.lval[1]; if (nfhp != NULL) { if (NFSRV_CMPFH(nfhp->nfh_fh, nfhp->nfh_len, dnp->n_fhp->nfh_fh, dnp->n_fhp->nfh_len)) { VREF(vp); newvp = vp; unlocknewvp = 0; free(nfhp, M_NFSFH); np = dnp; } else if (isdotdot != 0) { /* * Skip doing a nfscl_nget() call for "..". * There's a race between acquiring the nfs * node here and lookups that look for the * directory being read (in the parent). * It would try to get a lock on ".." here, * owning the lock on the directory being * read. Lookup will hold the lock on ".." * and try to acquire the lock on the * directory being read. * If the directory is unlocked/relocked, * then there is a LOR with the buflock * vp is relocked. */ free(nfhp, M_NFSFH); } else { error = nfscl_nget(vp->v_mount, vp, nfhp, cnp, p, &np, NULL, LK_EXCLUSIVE); if (!error) { newvp = NFSTOV(np); unlocknewvp = 1; } } nfhp = NULL; if (newvp != NULLVP) { error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 0); if (error) { if (unlocknewvp) vput(newvp); else vrele(newvp); goto nfsmout; } dp->d_type = vtonfs_dtype(np->n_vattr.na_type); ndp->ni_vp = newvp; NFSCNHASH(cnp, HASHINIT); if (cnp->cn_namelen <= NCHNAMLEN && ndp->ni_dvp != ndp->ni_vp && (newvp->v_type != VDIR || dctime.tv_sec != 0)) { cache_enter_time_flags(ndp->ni_dvp, ndp->ni_vp, cnp, &nfsva.na_ctime, newvp->v_type != VDIR ? NULL : &dctime, VFS_CACHE_DROPOLD); } if (unlocknewvp) vput(newvp); else vrele(newvp); newvp = NULLVP; } } } else if (nfhp != NULL) { free(nfhp, M_NFSFH); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); if (tryformoredirs) more_dirs = !eof; if (nd->nd_flag & ND_NFSV4) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } } m_freem(nd->nd_mrep); nd->nd_mrep = NULL; } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; } /* * If returning no data, assume end of file. * If not bigenough, return not end of file, since you aren't * returning all the data * Otherwise, return the eof flag from the server. */ if (eofp != NULL) { if (tresid == uiop->uio_resid) *eofp = 1; else if (!bigenough) *eofp = 0; else *eofp = eof; } /* * Add extra empty records to any remaining DIRBLKSIZ chunks. */ while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) { dp = (struct dirent *)uiop->uio_iov->iov_base; NFSBZERO(dp, DIRBLKSIZ); dp->d_type = DT_UNKNOWN; tl = (u_int32_t *)&dp->d_name[4]; *tl++ = cookie.lval[0]; *tl = cookie.lval[1]; dp->d_reclen = DIRBLKSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRBLKSIZ; uiop->uio_iov->iov_len -= DIRBLKSIZ; uiop->uio_resid -= DIRBLKSIZ; uiop->uio_offset += DIRBLKSIZ; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } #endif /* !APPLE */ /* * Nfs commit rpc */ int nfsrpc_commit(vnode_t vp, u_quad_t offset, int cnt, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_COMMIT, vp); NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_wcc_data(nd, vp, nap, attrflagp, NULL, stuff); if (!error && !nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); NFSLOCKMNT(nmp); if (NFSBCMP(nmp->nm_verf, tl, NFSX_VERF)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); nd->nd_repstat = NFSERR_STALEWRITEVERF; } NFSUNLOCKMNT(nmp); if (nd->nd_flag & ND_NFSV4) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } nfsmout: if (!error && nd->nd_repstat) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * NFS byte range lock rpc. * (Mostly just calls one of the three lower level RPC routines.) */ int nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, int reclaim, struct ucred *cred, NFSPROC_T *p, void *id, int flags) { struct nfscllockowner *lp; struct nfsclclient *clp; struct nfsfh *nfhp; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); u_int64_t off, len; off_t start, end; u_int32_t clidrev = 0; int error = 0, newone = 0, expireret = 0, retrycnt, donelocally; int callcnt, dorpc; /* * Convert the flock structure into a start and end and do POSIX * bounds checking. */ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: /* * Caller is responsible for adding any necessary offset * when SEEK_CUR is used. */ start = fl->l_start; off = fl->l_start; break; case SEEK_END: start = size + fl->l_start; off = size + fl->l_start; break; default: return (EINVAL); } if (start < 0) return (EINVAL); if (fl->l_len != 0) { end = start + fl->l_len - 1; if (end < start) return (EINVAL); } len = fl->l_len; if (len == 0) len = NFS64BITSSET; retrycnt = 0; do { nd->nd_repstat = 0; if (op == F_GETLK) { error = nfscl_getcl(vp->v_mount, cred, p, false, true, &clp); if (error) return (error); error = nfscl_lockt(vp, clp, off, len, fl, p, id, flags); if (!error) { clidrev = clp->nfsc_clientidrev; error = nfsrpc_lockt(nd, vp, clp, off, len, fl, cred, p, id, flags); } else if (error == -1) { error = 0; } nfscl_clientrelease(clp); } else if (op == F_UNLCK && fl->l_type == F_UNLCK) { /* * We must loop around for all lockowner cases. */ callcnt = 0; error = nfscl_getcl(vp->v_mount, cred, p, false, true, &clp); if (error) return (error); do { error = nfscl_relbytelock(vp, off, len, cred, p, callcnt, clp, id, flags, &lp, &dorpc); /* * If it returns a NULL lp, we're done. */ if (lp == NULL) { if (callcnt == 0) nfscl_clientrelease(clp); else nfscl_releasealllocks(clp, vp, p, id, flags); return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; /* * If the server doesn't support Posix lock semantics, * only allow locks on the entire file, since it won't * handle overlapping byte ranges. * There might still be a problem when a lock * upgrade/downgrade (read<->write) occurs, since the * server "might" expect an unlock first? */ if (dorpc && (lp->nfsl_open->nfso_posixlock || (off == 0 && len == NFS64BITSSET))) { /* * Since the lock records will go away, we must * wait for grace and delay here. */ do { error = nfsrpc_locku(nd, nmp, lp, off, len, NFSV4LOCKT_READ, cred, p, 0); if ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0) (void) nfs_catnap(PZERO, (int)nd->nd_repstat, "nfs_advlock"); } while ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0); } callcnt++; } while (error == 0 && nd->nd_repstat == 0); nfscl_releasealllocks(clp, vp, p, id, flags); } else if (op == F_SETLK) { error = nfscl_getbytelock(vp, off, len, fl->l_type, cred, p, NULL, 0, id, flags, NULL, NULL, &lp, &newone, &donelocally); if (error || donelocally) { return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; nfhp = VTONFS(vp)->n_fhp; if (!lp->nfsl_open->nfso_posixlock && (off != 0 || len != NFS64BITSSET)) { error = EINVAL; } else { error = nfsrpc_lock(nd, nmp, vp, nfhp->nfh_fh, nfhp->nfh_len, lp, newone, reclaim, off, len, fl->l_type, cred, p, 0); } if (!error) error = nd->nd_repstat; nfscl_lockrelease(lp, error, newone); } else { error = EINVAL; } if (!error) error = nd->nd_repstat; if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALECLIENTID || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_advlock"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_DELAY || error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * The lower level routine for the LockT case. */ int nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp, struct nfsclclient *clp, u_int64_t off, u_int64_t len, struct flock *fl, struct ucred *cred, NFSPROC_T *p, void *id, int flags) { u_int32_t *tl; int error, type, size; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsnode *np; struct nfsmount *nmp; struct nfsclsession *tsep; nmp = VFSTONFS(vp->v_mount); NFSCL_REQSTART(nd, NFSPROC_LOCKT, vp); NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); if (fl->l_type == F_RDLCK) *tl++ = txdr_unsigned(NFSV4LOCKT_READ); else *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfscl_filllockowner(id, own, flags); np = VTONFS(vp); NFSBCOPY(np->n_fhp->nfh_fh, &own[NFSV4CL_LOCKNAMELEN], np->n_fhp->nfh_len); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + np->n_fhp->nfh_len); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); if (nd->nd_repstat == 0) { fl->l_type = F_UNLCK; } else if (nd->nd_repstat == NFSERR_DENIED) { nd->nd_repstat = 0; fl->l_whence = SEEK_SET; NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED); fl->l_start = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; if (len == NFS64BITSSET) fl->l_len = 0; else fl->l_len = len; type = fxdr_unsigned(int, *tl++); if (type == NFSV4LOCKT_WRITE) fl->l_type = F_WRLCK; else fl->l_type = F_RDLCK; /* * XXX For now, I have no idea what to do with the * conflicting lock_owner, so I'll just set the pid == 0 * and skip over the lock_owner. */ fl->l_pid = (pid_t)0; tl += 2; size = fxdr_unsigned(int, *tl); if (size < 0 || size > NFSV4_OPAQUELIMIT) error = EBADRPC; if (!error) error = nfsm_advance(nd, NFSM_RNDUP(size), -1); } else if (nd->nd_repstat == NFSERR_STALECLIENTID) nfscl_initiate_recovery(clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Lower level function that performs the LockU RPC. */ static int nfsrpc_locku(struct nfsrv_descript *nd, struct nfsmount *nmp, struct nfscllockowner *lp, u_int64_t off, u_int64_t len, u_int32_t type, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error; nfscl_reqstart(nd, NFSPROC_LOCKU, nmp, lp->nfsl_open->nfso_fh, lp->nfsl_open->nfso_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(type); *tl = txdr_unsigned(lp->nfsl_seqid); if (nfstest_outofseq && (arc4random() % nfstest_outofseq) == 0) *tl = txdr_unsigned(lp->nfsl_seqid + 1); tl++; if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_stateid.seqid; *tl++ = lp->nfsl_stateid.other[0]; *tl++ = lp->nfsl_stateid.other[1]; *tl++ = lp->nfsl_stateid.other[2]; txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_INCRSEQID(lp->nfsl_seqid, nd); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); lp->nfsl_stateid.seqid = *tl++; lp->nfsl_stateid.other[0] = *tl++; lp->nfsl_stateid.other[1] = *tl++; lp->nfsl_stateid.other[2] = *tl; } else if (nd->nd_repstat == NFSERR_STALESTATEID) nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The actual Lock RPC. */ int nfsrpc_lock(struct nfsrv_descript *nd, struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, struct nfscllockowner *lp, int newone, int reclaim, u_int64_t off, u_int64_t len, short type, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error, size; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_LOCK, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); if (type == F_RDLCK) *tl++ = txdr_unsigned(NFSV4LOCKT_READ); else *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); *tl++ = txdr_unsigned(reclaim); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (newone) { *tl = newnfs_true; NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 2 * NFSX_UNSIGNED + NFSX_HYPER); *tl++ = txdr_unsigned(lp->nfsl_open->nfso_own->nfsow_seqid); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_open->nfso_stateid.seqid; *tl++ = lp->nfsl_open->nfso_stateid.other[0]; *tl++ = lp->nfsl_open->nfso_stateid.other[1]; *tl++ = lp->nfsl_open->nfso_stateid.other[2]; *tl++ = txdr_unsigned(lp->nfsl_seqid); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN); NFSBCOPY(nfhp, &own[NFSV4CL_LOCKNAMELEN], fhlen); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen); } else { *tl = newnfs_false; NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_stateid.seqid; *tl++ = lp->nfsl_stateid.other[0]; *tl++ = lp->nfsl_stateid.other[1]; *tl++ = lp->nfsl_stateid.other[2]; *tl = txdr_unsigned(lp->nfsl_seqid); if (nfstest_outofseq && (arc4random() % nfstest_outofseq) == 0) *tl = txdr_unsigned(lp->nfsl_seqid + 1); } if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (newone) NFSCL_INCRSEQID(lp->nfsl_open->nfso_own->nfsow_seqid, nd); NFSCL_INCRSEQID(lp->nfsl_seqid, nd); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); lp->nfsl_stateid.seqid = *tl++; lp->nfsl_stateid.other[0] = *tl++; lp->nfsl_stateid.other[1] = *tl++; lp->nfsl_stateid.other[2] = *tl; } else if (nd->nd_repstat == NFSERR_DENIED) { NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED); size = fxdr_unsigned(int, *(tl + 7)); if (size < 0 || size > NFSV4_OPAQUELIMIT) error = EBADRPC; if (!error) error = nfsm_advance(nd, NFSM_RNDUP(size), -1); } else if (nd->nd_repstat == NFSERR_STALESTATEID) nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs statfs rpc * (always called with the vp for the mount point) */ int nfsrpc_statfs(vnode_t vp, struct nfsstatfs *sbp, struct nfsfsinfo *fsp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl = NULL; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; nfsattrbit_t attrbits; int error; *attrflagp = 0; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4(nmp)) { /* * For V4, you actually do a getattr. */ NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); NFSSTATFS_GETATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_USEGSSNAME; error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, sbp, fsp, NULL, 0, NULL, NULL, NULL, p, cred); if (!error) { nmp->nm_fsid[0] = nap->na_filesid[0]; nmp->nm_fsid[1] = nap->na_filesid[1]; NFSSETHASSETFSID(nmp); *attrflagp = 1; } } else { error = nd->nd_repstat; } if (error) goto nfsmout; } else { NFSCL_REQSTART(nd, NFSPROC_FSSTAT, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } if (nd->nd_repstat) { error = nd->nd_repstat; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_STATFS(nd->nd_flag & ND_NFSV3)); } if (NFSHASNFSV3(nmp)) { sbp->sf_tbytes = fxdr_hyper(tl); tl += 2; sbp->sf_fbytes = fxdr_hyper(tl); tl += 2; sbp->sf_abytes = fxdr_hyper(tl); tl += 2; sbp->sf_tfiles = fxdr_hyper(tl); tl += 2; sbp->sf_ffiles = fxdr_hyper(tl); tl += 2; sbp->sf_afiles = fxdr_hyper(tl); tl += 2; sbp->sf_invarsec = fxdr_unsigned(u_int32_t, *tl); } else if (NFSHASNFSV4(nmp) == 0) { sbp->sf_tsize = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bsize = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_blocks = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bfree = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bavail = fxdr_unsigned(u_int32_t, *tl); } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs pathconf rpc */ int nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; u_int32_t *tl; nfsattrbit_t attrbits; int error; *attrflagp = 0; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4(nmp)) { /* * For V4, you actually do a getattr. */ NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); NFSPATHCONF_GETATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_USEGSSNAME; error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (!error) *attrflagp = 1; } else { error = nd->nd_repstat; } } else { NFSCL_REQSTART(nd, NFSPROC_PATHCONF, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; if (!error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V3PATHCONF); pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl++); pc->pc_namemax = fxdr_unsigned(u_int32_t, *tl++); pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl++); pc->pc_chownrestricted = fxdr_unsigned(u_int32_t, *tl++); pc->pc_caseinsensitive = fxdr_unsigned(u_int32_t, *tl++); pc->pc_casepreserving = fxdr_unsigned(u_int32_t, *tl); } } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs version 3 fsinfo rpc call */ int nfsrpc_fsinfo(vnode_t vp, struct nfsfsinfo *fsp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_FSINFO, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; if (!error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V3FSINFO); fsp->fs_rtmax = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_rtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_rtmult = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtmax = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtmult = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_dtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_maxfilesize = fxdr_hyper(tl); tl += 2; fxdr_nfsv3time(tl, &fsp->fs_timedelta); tl += 2; fsp->fs_properties = fxdr_unsigned(u_int32_t, *tl); } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * This function performs the Renew RPC. */ int nfsrpc_renew(struct nfsclclient *clp, struct nfsclds *dsp, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsmount *nmp; int error; struct nfssockreq *nrp; struct nfsclsession *tsep; nmp = clp->nfsc_nmp; if (nmp == NULL) return (0); if (dsp == NULL) nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, NULL, 0, 0); else nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, &dsp->nfsclds_sess, 0, 0); if (!NFSHASNFSV4N(nmp)) { /* NFSv4.1 just uses a Sequence Op and not a Renew. */ NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; } nrp = NULL; if (dsp != NULL) nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; nd->nd_flag |= ND_USEGSSNAME; if (dsp == NULL) error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); else { error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, &dsp->nfsclds_sess); if (error == ENXIO) nfscl_cancelreqs(dsp); } if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * This function performs the Releaselockowner RPC. */ int nfsrpc_rellockown(struct nfsmount *nmp, struct nfscllockowner *lp, uint8_t *fh, int fhlen, struct ucred *cred, NFSPROC_T *p) { struct nfsrv_descript nfsd, *nd = &nfsd; u_int32_t *tl; int error; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsclsession *tsep; if (NFSHASNFSV4N(nmp)) { /* For NFSv4.1, do a FreeStateID. */ nfscl_reqstart(nd, NFSPROC_FREESTATEID, nmp, NULL, 0, NULL, NULL, 0, 0); nfsm_stateidtom(nd, &lp->nfsl_stateid, NFSSTATEID_PUTSTATEID); } else { nfscl_reqstart(nd, NFSPROC_RELEASELCKOWN, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN); NFSBCOPY(fh, &own[NFSV4CL_LOCKNAMELEN], fhlen); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen); } nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * This function performs the Compound to get the mount pt FH. */ int nfsrpc_getdirpath(struct nfsmount *nmp, u_char *dirpath, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; u_char *cp, *cp2; int error, cnt, len, setnil; u_int32_t *opcntp; nfscl_reqstart(nd, NFSPROC_PUTROOTFH, nmp, NULL, 0, &opcntp, NULL, 0, 0); cp = dirpath; cnt = 0; do { setnil = 0; while (*cp == '/') cp++; cp2 = cp; while (*cp2 != '\0' && *cp2 != '/') cp2++; if (*cp2 == '/') { setnil = 1; *cp2 = '\0'; } if (cp2 != cp) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LOOKUP); nfsm_strtom(nd, cp, strlen(cp)); cnt++; } if (setnil) *cp2++ = '/'; cp = cp2; } while (*cp != '\0'); if (NFSHASNFSV4N(nmp)) /* Has a Sequence Op done by nfscl_reqstart(). */ *opcntp = txdr_unsigned(3 + cnt); else *opcntp = txdr_unsigned(2 + cnt); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETFH); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, (3 + 2 * cnt) * NFSX_UNSIGNED); tl += (2 + 2 * cnt); if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) { nd->nd_repstat = NFSERR_BADXDR; } else { nd->nd_repstat = nfsrv_mtostr(nd, nmp->nm_fh, len); if (nd->nd_repstat == 0) nmp->nm_fhsize = len; } } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * This function performs the Delegreturn RPC. */ int nfsrpc_delegreturn(struct nfscldeleg *dp, struct ucred *cred, struct nfsmount *nmp, NFSPROC_T *p, int syscred) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_DELEGRETURN, nmp, dp->nfsdl_fh, dp->nfsdl_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dp->nfsdl_stateid.seqid; *tl++ = dp->nfsdl_stateid.other[0]; *tl++ = dp->nfsdl_stateid.other[1]; *tl = dp->nfsdl_stateid.other[2]; if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs getacl call. */ int nfsrpc_getacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp)) return (EOPNOTSUPP); NFSCL_REQSTART(nd, NFSPROC_GETACL, vp); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!nd->nd_repstat) error = nfsv4_loadattr(nd, vp, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, p, cred); else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs setacl call. */ int nfsrpc_setacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, void *stuff) { int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp)) return (EOPNOTSUPP); error = nfsrpc_setattr(vp, NULL, aclp, cred, p, NULL, NULL, stuff); return (error); } /* * nfs setacl call. */ static int nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, nfsv4stateid_t *stateidp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (!NFSHASNFSV4(nmp)) return (EOPNOTSUPP); NFSCL_REQSTART(nd, NFSPROC_SETACL, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0, &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); /* Don't care about the pre/postop attributes */ m_freem(nd->nd_mrep); return (nd->nd_repstat); } /* * Do the NFSv4.1 Exchange ID. */ int nfsrpc_exchangeid(struct nfsmount *nmp, struct nfsclclient *clp, struct nfssockreq *nrp, int minorvers, uint32_t exchflags, struct nfsclds **dspp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl, v41flags; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsclds *dsp; struct timespec verstime; int error, len; *dspp = NULL; if (minorvers == 0) minorvers = nmp->nm_minorvers; nfscl_reqstart(nd, NFSPROC_EXCHANGEID, nmp, NULL, 0, NULL, NULL, NFS_VER4, minorvers); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(nfsboottime.tv_sec); /* Client owner */ *tl = txdr_unsigned(clp->nfsc_rev); (void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(exchflags); *tl++ = txdr_unsigned(NFSV4EXCH_SP4NONE); /* Set the implementation id4 */ *tl = txdr_unsigned(1); (void) nfsm_strtom(nd, "freebsd.org", strlen("freebsd.org")); (void) nfsm_strtom(nd, version, strlen(version)); NFSM_BUILD(tl, uint32_t *, NFSX_V4TIME); verstime.tv_sec = 1293840000; /* Jan 1, 2011 */ verstime.tv_nsec = 0; txdr_nfsv4time(&verstime, tl); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_DEBUG(1, "exchangeid err=%d reps=%d\n", error, (int)nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 6 * NFSX_UNSIGNED + NFSX_HYPER); len = fxdr_unsigned(int, *(tl + 7)); if (len < 0 || len > NFSV4_OPAQUELIMIT) { error = NFSERR_BADXDR; goto nfsmout; } dsp = malloc(sizeof(struct nfsclds) + len + 1, M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew; dsp->nfsclds_servownlen = len; dsp->nfsclds_sess.nfsess_clientid.lval[0] = *tl++; dsp->nfsclds_sess.nfsess_clientid.lval[1] = *tl++; dsp->nfsclds_sess.nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++); v41flags = fxdr_unsigned(uint32_t, *tl); if ((v41flags & NFSV4EXCH_USEPNFSMDS) != 0 && NFSHASPNFSOPT(nmp)) { NFSCL_DEBUG(1, "set PNFS\n"); NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_PNFS; NFSUNLOCKMNT(nmp); dsp->nfsclds_flags |= NFSCLDS_MDS; } if ((v41flags & NFSV4EXCH_USEPNFSDS) != 0) dsp->nfsclds_flags |= NFSCLDS_DS; if (minorvers == NFSV42_MINORVERSION) dsp->nfsclds_flags |= NFSCLDS_MINORV2; if (len > 0) nd->nd_repstat = nfsrv_mtostr(nd, dsp->nfsclds_serverown, len); if (nd->nd_repstat == 0) { mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); nfscl_initsessionslots(&dsp->nfsclds_sess); *dspp = dsp; } else free(dsp, M_NFSCLDS); } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Create Session. */ int nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, struct nfssockreq *nrp, struct nfsclds *dsp, uint32_t sequenceid, int mds, struct ucred *cred, NFSPROC_T *p) { uint32_t crflags, maxval, *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error, irdcnt, minorvers; /* Make sure nm_rsize, nm_wsize is set. */ if (nmp->nm_rsize > NFS_MAXBSIZE || nmp->nm_rsize == 0) nmp->nm_rsize = NFS_MAXBSIZE; if (nmp->nm_wsize > NFS_MAXBSIZE || nmp->nm_wsize == 0) nmp->nm_wsize = NFS_MAXBSIZE; if (dsp == NULL) minorvers = nmp->nm_minorvers; else if ((dsp->nfsclds_flags & NFSCLDS_MINORV2) != 0) minorvers = NFSV42_MINORVERSION; else minorvers = NFSV41_MINORVERSION; nfscl_reqstart(nd, NFSPROC_CREATESESSION, nmp, NULL, 0, NULL, NULL, NFS_VER4, minorvers); NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED); *tl++ = sep->nfsess_clientid.lval[0]; *tl++ = sep->nfsess_clientid.lval[1]; *tl++ = txdr_unsigned(sequenceid); crflags = (NFSMNT_RDONLY(nmp->nm_mountp) ? 0 : NFSV4CRSESS_PERSIST); if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0 && mds != 0) crflags |= NFSV4CRSESS_CONNBACKCHAN; *tl = txdr_unsigned(crflags); /* Fill in fore channel attributes. */ NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED); *tl++ = 0; /* Header pad size */ if ((nd->nd_flag & ND_NFSV42) != 0 && mds != 0 && sb_max_adj >= nmp->nm_wsize && sb_max_adj >= nmp->nm_rsize) { /* * NFSv4.2 Extended Attribute operations may want to do * requests/replies that are larger than nm_rsize/nm_wsize. */ *tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR); *tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR); } else { *tl++ = txdr_unsigned(nmp->nm_wsize + NFS_MAXXDR); *tl++ = txdr_unsigned(nmp->nm_rsize + NFS_MAXXDR); } *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(20); /* Max operations */ *tl++ = txdr_unsigned(64); /* Max slots */ *tl = 0; /* No rdma ird */ /* Fill in back channel attributes. */ NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED); *tl++ = 0; /* Header pad size */ *tl++ = txdr_unsigned(10000); /* Max request size */ *tl++ = txdr_unsigned(10000); /* Max response size */ *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(4); /* Max operations */ *tl++ = txdr_unsigned(NFSV4_CBSLOTS); /* Max slots */ *tl = 0; /* No rdma ird */ NFSM_BUILD(tl, uint32_t *, 8 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFS_CALLBCKPROG); /* Call back prog # */ /* Allow AUTH_SYS callbacks as uid, gid == 0. */ *tl++ = txdr_unsigned(1); /* Auth_sys only */ *tl++ = txdr_unsigned(AUTH_SYS); /* AUTH_SYS type */ *tl++ = txdr_unsigned(nfsboottime.tv_sec); /* time stamp */ *tl++ = 0; /* Null machine name */ *tl++ = 0; /* Uid == 0 */ *tl++ = 0; /* Gid == 0 */ *tl = 0; /* No additional gids */ nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED); bcopy(tl, sep->nfsess_sessionid, NFSX_V4SESSIONID); tl += NFSX_V4SESSIONID / NFSX_UNSIGNED; sep->nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++); crflags = fxdr_unsigned(uint32_t, *tl); if ((crflags & NFSV4CRSESS_PERSIST) != 0 && mds != 0) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_SESSPERSIST; NFSUNLOCKMNT(nmp); } /* Get the fore channel slot count. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl++; /* Skip the header pad size. */ /* Make sure nm_wsize is small enough. */ maxval = fxdr_unsigned(uint32_t, *tl++); while (maxval < nmp->nm_wsize + NFS_MAXXDR) { if (nmp->nm_wsize > 8096) nmp->nm_wsize /= 2; else break; } sep->nfsess_maxreq = maxval; /* Make sure nm_rsize is small enough. */ maxval = fxdr_unsigned(uint32_t, *tl++); while (maxval < nmp->nm_rsize + NFS_MAXXDR) { if (nmp->nm_rsize > 8096) nmp->nm_rsize /= 2; else break; } sep->nfsess_maxresp = maxval; sep->nfsess_maxcache = fxdr_unsigned(int, *tl++); tl++; sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++); NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots); irdcnt = fxdr_unsigned(int, *tl); if (irdcnt > 0) NFSM_DISSECT(tl, uint32_t *, irdcnt * NFSX_UNSIGNED); /* and the back channel slot count. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl += 5; sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl); NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots); } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Destroy Session. */ int nfsrpc_destroysession(struct nfsmount *nmp, struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_DESTROYSESSION, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID); tsep = nfsmnt_mdssession(nmp); bcopy(tsep->nfsess_sessionid, tl, NFSX_V4SESSIONID); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Destroy Client. */ int nfsrpc_destroyclient(struct nfsmount *nmp, struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_DESTROYCLIENT, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutGet. */ static int nfsrpc_layoutget(struct nfsmount *nmp, uint8_t *fhp, int fhlen, int iomode, uint64_t offset, uint64_t len, uint64_t minlen, int layouttype, int layoutlen, nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp, struct ucred *cred, NFSPROC_T *p, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTGET, nmp, fhp, fhlen, NULL, NULL, 0, 0); nfsrv_setuplayoutget(nd, iomode, offset, len, minlen, stateidp, layouttype, layoutlen, 0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_DEBUG(4, "layget err=%d st=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error == 0 && nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Get Device Info. */ int nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype, uint32_t *notifybitsp, struct nfscldevinfo **ndip, struct ucred *cred, NFSPROC_T *p) { uint32_t cnt, *tl, vers, minorvers; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct sockaddr_in sin, ssin; struct sockaddr_in6 sin6, ssin6; struct nfsclds *dsp = NULL, **dspp, **gotdspp; struct nfscldevinfo *ndi; int addrcnt = 0, bitcnt, error, gotminor, gotvers, i, isudp, j; int stripecnt; uint8_t stripeindex; sa_family_t af, safilled; ssin.sin_port = 0; /* To shut up compiler. */ ssin.sin_addr.s_addr = 0; /* ditto */ *ndip = NULL; ndi = NULL; gotdspp = NULL; nfscl_reqstart(nd, NFSPROC_GETDEVICEINFO, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_V4DEVICEID + 3 * NFSX_UNSIGNED); NFSBCOPY(deviceid, tl, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(100000); if (notifybitsp != NULL && *notifybitsp != 0) { *tl = txdr_unsigned(1); /* One word of bits. */ NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(*notifybitsp); } else *tl = txdr_unsigned(0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (layouttype != fxdr_unsigned(int, *tl)) printf("EEK! devinfo layout type not same!\n"); if (layouttype == NFSLAYOUT_NFSV4_1_FILES) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); stripecnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt); if (stripecnt < 1 || stripecnt > 4096) { printf("pNFS File layout devinfo stripecnt %d:" " out of range\n", stripecnt); error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, (stripecnt + 1) * NFSX_UNSIGNED); addrcnt = fxdr_unsigned(int, *(tl + stripecnt)); NFSCL_DEBUG(4, "addrcnt=%d\n", addrcnt); if (addrcnt < 1 || addrcnt > 128) { printf("NFS devinfo addrcnt %d: out of range\n", addrcnt); error = NFSERR_BADXDR; goto nfsmout; } /* * Now we know how many stripe indices and addresses, so * we can allocate the structure the correct size. */ i = (stripecnt * sizeof(uint8_t)) / sizeof(struct nfsclds *) + 1; NFSCL_DEBUG(4, "stripeindices=%d\n", i); ndi = malloc(sizeof(*ndi) + (addrcnt + i) * sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK | M_ZERO); NFSBCOPY(deviceid, ndi->nfsdi_deviceid, NFSX_V4DEVICEID); ndi->nfsdi_refcnt = 0; ndi->nfsdi_flags = NFSDI_FILELAYOUT; ndi->nfsdi_stripecnt = stripecnt; ndi->nfsdi_addrcnt = addrcnt; /* Fill in the stripe indices. */ for (i = 0; i < stripecnt; i++) { stripeindex = fxdr_unsigned(uint8_t, *tl++); NFSCL_DEBUG(4, "stripeind=%d\n", stripeindex); if (stripeindex >= addrcnt) { printf("pNFS File Layout devinfo" " stripeindex %d: too big\n", (int)stripeindex); error = NFSERR_BADXDR; goto nfsmout; } nfsfldi_setstripeindex(ndi, i, stripeindex); } } else if (layouttype == NFSLAYOUT_FLEXFILE) { /* For Flex File, we only get one address list. */ ndi = malloc(sizeof(*ndi) + sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK | M_ZERO); NFSBCOPY(deviceid, ndi->nfsdi_deviceid, NFSX_V4DEVICEID); ndi->nfsdi_refcnt = 0; ndi->nfsdi_flags = NFSDI_FLEXFILE; addrcnt = ndi->nfsdi_addrcnt = 1; } /* Now, dissect the server address(es). */ safilled = AF_UNSPEC; for (i = 0; i < addrcnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); cnt = fxdr_unsigned(uint32_t, *tl); if (cnt == 0) { printf("NFS devinfo 0 len addrlist\n"); error = NFSERR_BADXDR; goto nfsmout; } dspp = nfsfldi_addr(ndi, i); safilled = AF_UNSPEC; for (j = 0; j < cnt; j++) { error = nfsv4_getipaddr(nd, &sin, &sin6, &af, &isudp); if (error != 0 && error != EPERM) { error = NFSERR_BADXDR; goto nfsmout; } if (error == 0 && isudp == 0) { /* * The priority is: * - Same address family. * Save the address and dspp, so that * the connection can be done after * parsing is complete. */ if (safilled == AF_UNSPEC || (af == nmp->nm_nam->sa_family && safilled != nmp->nm_nam->sa_family) ) { if (af == AF_INET) ssin = sin; else ssin6 = sin6; safilled = af; gotdspp = dspp; } } } } gotvers = NFS_VER4; /* Default NFSv4.1 for File Layout. */ gotminor = NFSV41_MINORVERSION; /* For Flex File, we will take one of the versions to use. */ if (layouttype == NFSLAYOUT_FLEXFILE) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 1 || j > NFSDEV_MAXVERS) { printf("pNFS: too many versions\n"); error = NFSERR_BADXDR; goto nfsmout; } gotvers = 0; gotminor = 0; for (i = 0; i < j; i++) { NFSM_DISSECT(tl, uint32_t *, 5 * NFSX_UNSIGNED); vers = fxdr_unsigned(uint32_t, *tl++); minorvers = fxdr_unsigned(uint32_t, *tl++); if (vers == NFS_VER3) minorvers = 0; if ((vers == NFS_VER4 && ((minorvers == NFSV41_MINORVERSION && gotminor == 0) || minorvers == NFSV42_MINORVERSION)) || (vers == NFS_VER3 && gotvers == 0)) { gotvers = vers; gotminor = minorvers; /* We'll take this one. */ ndi->nfsdi_versindex = i; ndi->nfsdi_vers = vers; ndi->nfsdi_minorvers = minorvers; ndi->nfsdi_rsize = fxdr_unsigned( uint32_t, *tl++); ndi->nfsdi_wsize = fxdr_unsigned( uint32_t, *tl++); if (*tl == newnfs_true) ndi->nfsdi_flags |= NFSDI_TIGHTCOUPLED; else ndi->nfsdi_flags &= ~NFSDI_TIGHTCOUPLED; } } if (gotvers == 0) { printf("pNFS: no NFSv3, NFSv4.1 or NFSv4.2\n"); error = NFSERR_BADXDR; goto nfsmout; } } /* And the notify bits. */ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); bitcnt = fxdr_unsigned(int, *tl); if (bitcnt > 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (notifybitsp != NULL) *notifybitsp = fxdr_unsigned(uint32_t, *tl); } if (safilled != AF_UNSPEC) { KASSERT(ndi != NULL, ("ndi is NULL")); *ndip = ndi; } else error = EPERM; if (error == 0) { /* * Now we can do a TCP connection for the correct * NFS version and IP address. */ error = nfsrpc_fillsa(nmp, &ssin, &ssin6, safilled, gotvers, gotminor, &dsp, p); } if (error == 0) { KASSERT(gotdspp != NULL, ("gotdspp is NULL")); *gotdspp = dsp; } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: if (error != 0 && ndi != NULL) nfscl_freedevinfo(ndi); m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutCommit. */ int nfsrpc_layoutcommit(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim, uint64_t off, uint64_t len, uint64_t lastbyte, nfsv4stateid_t *stateidp, int layouttype, struct ucred *cred, NFSPROC_T *p, void *stuff) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTCOMMIT, nmp, fh, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED + 3 * NFSX_HYPER + NFSX_STATEID); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (reclaim != 0) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = txdr_unsigned(stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; *tl++ = newnfs_true; if (lastbyte < off) lastbyte = off; else if (lastbyte >= (off + len)) lastbyte = off + len - 1; txdr_hyper(lastbyte, tl); tl += 2; *tl++ = newnfs_false; *tl++ = txdr_unsigned(layouttype); /* All supported layouts are 0 length. */ *tl = txdr_unsigned(0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutReturn. */ int nfsrpc_layoutreturn(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim, int layouttype, uint32_t iomode, int layoutreturn, uint64_t offset, uint64_t len, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p, uint32_t stat, uint32_t op, char *devid) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; uint64_t tu64; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTRETURN, nmp, fh, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED); if (reclaim != 0) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(iomode); *tl = txdr_unsigned(layoutreturn); if (layoutreturn == NFSLAYOUTRETURN_FILE) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; txdr_hyper(len, tl); tl += 2; NFSCL_DEBUG(4, "layoutret stseq=%d\n", (int)stateidp->seqid); *tl++ = txdr_unsigned(stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; if (layouttype == NFSLAYOUT_NFSV4_1_FILES) *tl = txdr_unsigned(0); else if (layouttype == NFSLAYOUT_FLEXFILE) { if (stat != 0) { *tl = txdr_unsigned(2 * NFSX_HYPER + NFSX_STATEID + NFSX_V4DEVICEID + 5 * NFSX_UNSIGNED); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_V4DEVICEID + 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* One error. */ tu64 = 0; /* Offset. */ txdr_hyper(tu64, tl); tl += 2; tu64 = UINT64_MAX; /* Length. */ txdr_hyper(tu64, tl); tl += 2; NFSBCOPY(stateidp, tl, NFSX_STATEID); tl += (NFSX_STATEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* One error. */ NFSBCOPY(devid, tl, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(stat); *tl++ = txdr_unsigned(op); } else { *tl = txdr_unsigned(2 * NFSX_UNSIGNED); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); /* No ioerrs. */ *tl++ = 0; } *tl = 0; /* No stats yet. */ } } nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl != 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID); stateidp->seqid = fxdr_unsigned(uint32_t, *tl++); stateidp->other[0] = *tl++; stateidp->other[1] = *tl++; stateidp->other[2] = *tl; } } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Acquire a layout and devinfo, if possible. The caller must have acquired * a reference count on the nfsclclient structure before calling this. * Return the layout in lypp with a reference count on it, if successful. */ static int nfsrpc_getlayout(struct nfsmount *nmp, vnode_t vp, struct nfsfh *nfhp, int iomode, uint32_t rw, uint32_t *notifybitsp, nfsv4stateid_t *stateidp, uint64_t off, struct nfscllayout **lypp, struct ucred *cred, NFSPROC_T *p) { struct nfscllayout *lyp; struct nfsclflayout *flp; struct nfsclflayouthead flh; int error = 0, islocked, layoutlen, layouttype, recalled, retonclose; nfsv4stateid_t stateid; struct nfsclsession *tsep; *lypp = NULL; if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; /* * If lyp is returned non-NULL, there will be a refcnt (shared lock) * on it, iff flp != NULL or a lock (exclusive lock) on it iff * flp == NULL. */ lyp = nfscl_getlayout(nmp->nm_clp, nfhp->nfh_fh, nfhp->nfh_len, off, rw, &flp, &recalled); islocked = 0; if (lyp == NULL || flp == NULL) { if (recalled != 0) return (EIO); LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); if (lyp == NULL) { stateid.seqid = 0; stateid.other[0] = stateidp->other[0]; stateid.other[1] = stateidp->other[1]; stateid.other[2] = stateidp->other[2]; error = nfsrpc_layoutget(nmp, nfhp->nfh_fh, nfhp->nfh_len, iomode, (uint64_t)0, UINT64_MAX, (uint64_t)0, layouttype, layoutlen, &stateid, &retonclose, &flh, cred, p, NULL); } else { islocked = 1; stateid.seqid = lyp->nfsly_stateid.seqid; stateid.other[0] = lyp->nfsly_stateid.other[0]; stateid.other[1] = lyp->nfsly_stateid.other[1]; stateid.other[2] = lyp->nfsly_stateid.other[2]; error = nfsrpc_layoutget(nmp, nfhp->nfh_fh, nfhp->nfh_len, iomode, off, UINT64_MAX, (uint64_t)0, layouttype, layoutlen, &stateid, &retonclose, &flh, cred, p, NULL); } error = nfsrpc_layoutgetres(nmp, vp, nfhp->nfh_fh, nfhp->nfh_len, &stateid, retonclose, notifybitsp, &lyp, &flh, layouttype, error, NULL, cred, p); if (error == 0) *lypp = lyp; else if (islocked != 0) nfscl_rellayout(lyp, 1); } else *lypp = lyp; return (error); } /* * Do a TCP connection plus exchange id and create session. * If successful, a "struct nfsclds" is linked into the list for the * mount point and a pointer to it is returned. */ static int nfsrpc_fillsa(struct nfsmount *nmp, struct sockaddr_in *sin, struct sockaddr_in6 *sin6, sa_family_t af, int vers, int minorvers, struct nfsclds **dspp, NFSPROC_T *p) { struct sockaddr_in *msad, *sad; struct sockaddr_in6 *msad6, *sad6; struct nfsclclient *clp; struct nfssockreq *nrp; struct nfsclds *dsp, *tdsp; int error, firsttry; enum nfsclds_state retv; uint32_t sequenceid = 0; KASSERT(nmp->nm_sockreq.nr_cred != NULL, ("nfsrpc_fillsa: NULL nr_cred")); NFSLOCKCLSTATE(); clp = nmp->nm_clp; NFSUNLOCKCLSTATE(); if (clp == NULL) return (EPERM); if (af == AF_INET) { NFSLOCKMNT(nmp); /* * Check to see if we already have a session for this * address that is usable for a DS. * Note that the MDS's address is in a different place * than the sessions already acquired for DS's. */ msad = (struct sockaddr_in *)nmp->nm_sockreq.nr_nam; tdsp = TAILQ_FIRST(&nmp->nm_sess); while (tdsp != NULL) { if (msad != NULL && msad->sin_family == AF_INET && sin->sin_addr.s_addr == msad->sin_addr.s_addr && sin->sin_port == msad->sin_port && (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 && tdsp->nfsclds_sess.nfsess_defunct == 0) { *dspp = tdsp; NFSUNLOCKMNT(nmp); NFSCL_DEBUG(4, "fnd same addr\n"); return (0); } tdsp = TAILQ_NEXT(tdsp, nfsclds_list); if (tdsp != NULL && tdsp->nfsclds_sockp != NULL) msad = (struct sockaddr_in *) tdsp->nfsclds_sockp->nr_nam; else msad = NULL; } NFSUNLOCKMNT(nmp); /* No IP address match, so look for new/trunked one. */ sad = malloc(sizeof(*sad), M_SONAME, M_WAITOK | M_ZERO); sad->sin_len = sizeof(*sad); sad->sin_family = AF_INET; sad->sin_port = sin->sin_port; sad->sin_addr.s_addr = sin->sin_addr.s_addr; nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO); nrp->nr_nam = (struct sockaddr *)sad; } else if (af == AF_INET6) { NFSLOCKMNT(nmp); /* * Check to see if we already have a session for this * address that is usable for a DS. * Note that the MDS's address is in a different place * than the sessions already acquired for DS's. */ msad6 = (struct sockaddr_in6 *)nmp->nm_sockreq.nr_nam; tdsp = TAILQ_FIRST(&nmp->nm_sess); while (tdsp != NULL) { if (msad6 != NULL && msad6->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &msad6->sin6_addr) && sin6->sin6_port == msad6->sin6_port && (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 && tdsp->nfsclds_sess.nfsess_defunct == 0) { *dspp = tdsp; NFSUNLOCKMNT(nmp); return (0); } tdsp = TAILQ_NEXT(tdsp, nfsclds_list); if (tdsp != NULL && tdsp->nfsclds_sockp != NULL) msad6 = (struct sockaddr_in6 *) tdsp->nfsclds_sockp->nr_nam; else msad6 = NULL; } NFSUNLOCKMNT(nmp); /* No IP address match, so look for new/trunked one. */ sad6 = malloc(sizeof(*sad6), M_SONAME, M_WAITOK | M_ZERO); sad6->sin6_len = sizeof(*sad6); sad6->sin6_family = AF_INET6; sad6->sin6_port = sin6->sin6_port; NFSBCOPY(&sin6->sin6_addr, &sad6->sin6_addr, sizeof(struct in6_addr)); nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO); nrp->nr_nam = (struct sockaddr *)sad6; } else return (EPERM); nrp->nr_sotype = SOCK_STREAM; mtx_init(&nrp->nr_mtx, "nfssock", NULL, MTX_DEF); nrp->nr_prog = NFS_PROG; nrp->nr_vers = vers; /* * Use the credentials that were used for the mount, which are * in nmp->nm_sockreq.nr_cred for newnfs_connect() etc. * Ref. counting the credentials with crhold() is probably not * necessary, since nm_sockreq.nr_cred won't be crfree()'d until * unmount, but I did it anyhow. */ nrp->nr_cred = crhold(nmp->nm_sockreq.nr_cred); error = newnfs_connect(nmp, nrp, NULL, p, 0, false, &nrp->nr_client); NFSCL_DEBUG(3, "DS connect=%d\n", error); dsp = NULL; /* Now, do the exchangeid and create session. */ if (error == 0) { if (vers == NFS_VER4) { firsttry = 0; do { error = nfsrpc_exchangeid(nmp, clp, nrp, minorvers, NFSV4EXCH_USEPNFSDS, &dsp, nrp->nr_cred, p); NFSCL_DEBUG(3, "DS exchangeid=%d\n", error); if (error == NFSERR_MINORVERMISMATCH) minorvers = NFSV42_MINORVERSION; } while (error == NFSERR_MINORVERMISMATCH && firsttry++ == 0); if (error != 0) newnfs_disconnect(NULL, nrp); } else { dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_flags |= NFSCLDS_DS; dsp->nfsclds_expire = INT32_MAX; /* No renews needed. */ mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); } } if (error == 0) { dsp->nfsclds_sockp = nrp; if (vers == NFS_VER4) { NFSLOCKMNT(nmp); retv = nfscl_getsameserver(nmp, dsp, &tdsp, &sequenceid); NFSCL_DEBUG(3, "getsame ret=%d\n", retv); if (retv == NFSDSP_USETHISSESSION && nfscl_dssameconn != 0) { NFSLOCKDS(tdsp); tdsp->nfsclds_flags |= NFSCLDS_SAMECONN; NFSUNLOCKDS(tdsp); NFSUNLOCKMNT(nmp); /* * If there is already a session for this * server, use it. */ newnfs_disconnect(NULL, nrp); nfscl_freenfsclds(dsp); *dspp = tdsp; return (0); } if (retv == NFSDSP_NOTFOUND) sequenceid = dsp->nfsclds_sess.nfsess_sequenceid; NFSUNLOCKMNT(nmp); error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, nrp, dsp, sequenceid, 0, nrp->nr_cred, p); NFSCL_DEBUG(3, "DS createsess=%d\n", error); } } else { NFSFREECRED(nrp->nr_cred); NFSFREEMUTEX(&nrp->nr_mtx); free(nrp->nr_nam, M_SONAME); free(nrp, M_NFSSOCKREQ); } if (error == 0) { NFSCL_DEBUG(3, "add DS session\n"); /* * Put it at the end of the list. That way the list * is ordered by when the entry was added. This matters * since the one done first is the one that should be * used for sequencid'ing any subsequent create sessions. */ NFSLOCKMNT(nmp); TAILQ_INSERT_TAIL(&nmp->nm_sess, dsp, nfsclds_list); NFSUNLOCKMNT(nmp); *dspp = dsp; } else if (dsp != NULL) { newnfs_disconnect(NULL, nrp); nfscl_freenfsclds(dsp); } return (error); } /* * Do the NFSv4.1 Reclaim Complete. */ int nfsrpc_reclaimcomplete(struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_RECLAIMCOMPL, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = newnfs_false; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Initialize the slot tables for a session. */ static void nfscl_initsessionslots(struct nfsclsession *sep) { int i; for (i = 0; i < NFSV4_CBSLOTS; i++) { if (sep->nfsess_cbslots[i].nfssl_reply != NULL) m_freem(sep->nfsess_cbslots[i].nfssl_reply); NFSBZERO(&sep->nfsess_cbslots[i], sizeof(struct nfsslot)); } for (i = 0; i < 64; i++) sep->nfsess_slotseq[i] = 0; sep->nfsess_slots = 0; } /* * Called to try and do an I/O operation via an NFSv4.1 Data Server (DS). */ int nfscl_doiods(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, uint32_t rwaccess, int docommit, struct ucred *cred, NFSPROC_T *p) { struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfscllayout *layp; struct nfscldevinfo *dip; struct nfsclflayout *rflp; struct mbuf *m, *m2; struct nfsclwritedsdorpc *drpc, *tdrpc; nfsv4stateid_t stateid; struct ucred *newcred; uint64_t lastbyte, len, off, oresid, xfer; int eof, error, firstmirror, i, iolaymode, mirrorcnt, recalled, timo; void *lckp; uint8_t *dev; void *iovbase = NULL; size_t iovlen = 0; off_t offs = 0; ssize_t resid = 0; if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || (np->n_flag & NNOLAYOUT) != 0) return (EIO); /* Now, get a reference cnt on the clientid for this mount. */ if (nfscl_getref(nmp) == 0) return (EIO); /* Find an appropriate stateid. */ newcred = NFSNEWCRED(cred); error = nfscl_getstateid(vp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, rwaccess, 1, newcred, p, &stateid, &lckp); if (error != 0) { NFSFREECRED(newcred); nfscl_relref(nmp); return (error); } /* Search for a layout for this file. */ off = uiop->uio_offset; layp = nfscl_getlayout(nmp->nm_clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, off, rwaccess, &rflp, &recalled); if (layp == NULL || rflp == NULL) { if (recalled != 0) { NFSFREECRED(newcred); nfscl_relref(nmp); return (EIO); } if (layp != NULL) { nfscl_rellayout(layp, (rflp == NULL) ? 1 : 0); layp = NULL; } /* Try and get a Layout, if it is supported. */ if (rwaccess == NFSV4OPEN_ACCESSWRITE || (np->n_flag & NWRITEOPENED) != 0) iolaymode = NFSLAYOUTIOMODE_RW; else iolaymode = NFSLAYOUTIOMODE_READ; error = nfsrpc_getlayout(nmp, vp, np->n_fhp, iolaymode, rwaccess, NULL, &stateid, off, &layp, newcred, p); if (error != 0) { NFSLOCKNODE(np); np->n_flag |= NNOLAYOUT; NFSUNLOCKNODE(np); if (lckp != NULL) nfscl_lockderef(lckp); NFSFREECRED(newcred); if (layp != NULL) nfscl_rellayout(layp, 0); nfscl_relref(nmp); return (error); } } /* * Loop around finding a layout that works for the first part of * this I/O operation, and then call the function that actually * does the RPC. */ eof = 0; len = (uint64_t)uiop->uio_resid; while (len > 0 && error == 0 && eof == 0) { off = uiop->uio_offset; error = nfscl_findlayoutforio(layp, off, rwaccess, &rflp); if (error == 0) { oresid = xfer = (uint64_t)uiop->uio_resid; if (xfer > (rflp->nfsfl_end - rflp->nfsfl_off)) xfer = rflp->nfsfl_end - rflp->nfsfl_off; /* * For Flex File layout with mirrored DSs, select one * of them at random for reads. For writes and commits, * do all mirrors. */ m = NULL; tdrpc = drpc = NULL; firstmirror = 0; mirrorcnt = 1; if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0 && (mirrorcnt = rflp->nfsfl_mirrorcnt) > 1) { if (rwaccess == NFSV4OPEN_ACCESSREAD) { firstmirror = arc4random() % mirrorcnt; mirrorcnt = firstmirror + 1; } else { if (docommit == 0) { /* * Save values, so uiop can be * rolled back upon a write * error. */ offs = uiop->uio_offset; resid = uiop->uio_resid; iovbase = uiop->uio_iov->iov_base; iovlen = uiop->uio_iov->iov_len; m = nfsm_uiombuflist(uiop, len, 0); } tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK | M_ZERO); } } for (i = firstmirror; i < mirrorcnt && error == 0; i++){ m2 = NULL; if (m != NULL && i < mirrorcnt - 1) m2 = m_copym(m, 0, M_COPYALL, M_WAITOK); else { m2 = m; m = NULL; } if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0) { dev = rflp->nfsfl_ffm[i].dev; dip = nfscl_getdevinfo(nmp->nm_clp, dev, rflp->nfsfl_ffm[i].devp); } else { dev = rflp->nfsfl_dev; dip = nfscl_getdevinfo(nmp->nm_clp, dev, rflp->nfsfl_devp); } if (dip != NULL) { if ((rflp->nfsfl_flags & NFSFL_FLEXFILE) != 0) error = nfscl_dofflayoutio(vp, uiop, iomode, must_commit, &eof, &stateid, rwaccess, dip, layp, rflp, off, xfer, i, docommit, m2, tdrpc, newcred, p); else error = nfscl_doflayoutio(vp, uiop, iomode, must_commit, &eof, &stateid, rwaccess, dip, layp, rflp, off, xfer, docommit, newcred, p); nfscl_reldevinfo(dip); } else { if (m2 != NULL) m_freem(m2); error = EIO; } tdrpc++; } if (m != NULL) m_freem(m); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = firstmirror; i < mirrorcnt - 1 && tdrpc != NULL; i++, tdrpc++) { /* * For the unused drpc entries, both inprog and * err == 0, so this loop won't break. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "clrpcio", timo); if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); if (error == 0) { if (mirrorcnt > 1 && rwaccess == NFSV4OPEN_ACCESSWRITE && docommit == 0) { NFSLOCKCLSTATE(); layp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } lastbyte = off + xfer - 1; NFSLOCKCLSTATE(); if (lastbyte > layp->nfsly_lastbyte) layp->nfsly_lastbyte = lastbyte; NFSUNLOCKCLSTATE(); } else if (error == NFSERR_OPENMODE && rwaccess == NFSV4OPEN_ACCESSREAD) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } else error = EIO; if (error == 0) len -= (oresid - (uint64_t)uiop->uio_resid); else if (mirrorcnt > 1 && rwaccess == NFSV4OPEN_ACCESSWRITE && docommit == 0) { /* * In case the rpc gets retried, roll the * uio fields changed by nfsm_uiombuflist() * back. */ uiop->uio_offset = offs; uiop->uio_resid = resid; uiop->uio_iov->iov_base = iovbase; uiop->uio_iov->iov_len = iovlen; } } } if (lckp != NULL) nfscl_lockderef(lckp); NFSFREECRED(newcred); nfscl_rellayout(layp, 0); nfscl_relref(nmp); return (error); } /* * Find a file layout that will handle the first bytes of the requested * range and return the information from it needed to the I/O operation. */ int nfscl_findlayoutforio(struct nfscllayout *lyp, uint64_t off, uint32_t rwaccess, struct nfsclflayout **retflpp) { struct nfsclflayout *flp, *nflp, *rflp; uint32_t rw; rflp = NULL; rw = rwaccess; /* For reading, do the Read list first and then the Write list. */ do { if (rw == NFSV4OPEN_ACCESSREAD) flp = LIST_FIRST(&lyp->nfsly_flayread); else flp = LIST_FIRST(&lyp->nfsly_flayrw); while (flp != NULL) { nflp = LIST_NEXT(flp, nfsfl_list); if (flp->nfsfl_off > off) break; if (flp->nfsfl_end > off && (rflp == NULL || rflp->nfsfl_end < flp->nfsfl_end)) rflp = flp; flp = nflp; } if (rw == NFSV4OPEN_ACCESSREAD) rw = NFSV4OPEN_ACCESSWRITE; else rw = 0; } while (rw != 0); if (rflp != NULL) { /* This one covers the most bytes starting at off. */ *retflpp = rflp; return (0); } return (EIO); } /* * Do I/O using an NFSv4.1 or NFSv4.2 file layout. */ static int nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp, struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off, uint64_t len, int docommit, struct ucred *cred, NFSPROC_T *p) { uint64_t io_off, rel_off, stripe_unit_size, transfer, xfer; int commit_thru_mds, error, stripe_index, stripe_pos, minorvers; struct nfsnode *np; struct nfsfh *fhp; struct nfsclds **dspp; np = VTONFS(vp); rel_off = off - flp->nfsfl_patoff; stripe_unit_size = flp->nfsfl_util & NFSFLAYUTIL_STRIPE_MASK; stripe_pos = (rel_off / stripe_unit_size + flp->nfsfl_stripe1) % dp->nfsdi_stripecnt; transfer = stripe_unit_size - (rel_off % stripe_unit_size); error = 0; /* Loop around, doing I/O for each stripe unit. */ while (len > 0 && error == 0) { stripe_index = nfsfldi_stripeindex(dp, stripe_pos); dspp = nfsfldi_addr(dp, stripe_index); if (((*dspp)->nfsclds_flags & NFSCLDS_MINORV2) != 0) minorvers = NFSV42_MINORVERSION; else minorvers = NFSV41_MINORVERSION; if (len > transfer && docommit == 0) xfer = transfer; else xfer = len; if ((flp->nfsfl_util & NFSFLAYUTIL_DENSE) != 0) { /* Dense layout. */ if (stripe_pos >= flp->nfsfl_fhcnt) return (EIO); fhp = flp->nfsfl_fh[stripe_pos]; io_off = (rel_off / (stripe_unit_size * dp->nfsdi_stripecnt)) * stripe_unit_size + rel_off % stripe_unit_size; } else { /* Sparse layout. */ if (flp->nfsfl_fhcnt > 1) { if (stripe_index >= flp->nfsfl_fhcnt) return (EIO); fhp = flp->nfsfl_fh[stripe_index]; } else if (flp->nfsfl_fhcnt == 1) fhp = flp->nfsfl_fh[0]; else fhp = np->n_fhp; io_off = off; } if ((flp->nfsfl_util & NFSFLAYUTIL_COMMIT_THRU_MDS) != 0) { commit_thru_mds = 1; if (docommit != 0) error = EIO; } else { commit_thru_mds = 0; NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; NFSUNLOCKNODE(np); } if (docommit != 0) { if (error == 0) error = nfsrpc_commitds(vp, io_off, xfer, *dspp, fhp, NFS_VER4, minorvers, cred, p); if (error == 0) { /* * Set both eof and uio_resid = 0 to end any * loops. */ *eofp = 1; uiop->uio_resid = 0; } else { NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, io_off, xfer, fhp, 0, NFS_VER4, minorvers, cred, p); else { error = nfsrpc_writeds(vp, uiop, iomode, must_commit, stateidp, *dspp, io_off, xfer, fhp, commit_thru_mds, 0, NFS_VER4, minorvers, cred, p); if (error == 0) { NFSLOCKCLSTATE(); lyp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } } if (error == 0) { transfer = stripe_unit_size; stripe_pos = (stripe_pos + 1) % dp->nfsdi_stripecnt; len -= xfer; off += xfer; } } return (error); } /* * Do I/O using an NFSv4.1 flex file layout. */ static int nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp, struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off, uint64_t len, int mirror, int docommit, struct mbuf *mp, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { uint64_t xfer; int error; struct nfsnode *np; struct nfsfh *fhp; struct nfsclds **dspp; struct ucred *tcred; struct mbuf *m, *m2; uint32_t copylen; np = VTONFS(vp); error = 0; NFSCL_DEBUG(4, "nfscl_dofflayoutio: off=%ju len=%ju\n", (uintmax_t)off, (uintmax_t)len); /* Loop around, doing I/O for each stripe unit. */ while (len > 0 && error == 0) { dspp = nfsfldi_addr(dp, 0); fhp = flp->nfsfl_ffm[mirror].fh[dp->nfsdi_versindex]; stateidp = &flp->nfsfl_ffm[mirror].st; NFSCL_DEBUG(4, "mirror=%d vind=%d fhlen=%d st.seqid=0x%x\n", mirror, dp->nfsdi_versindex, fhp->nfh_len, stateidp->seqid); if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) { tcred = NFSNEWCRED(cred); tcred->cr_uid = flp->nfsfl_ffm[mirror].user; tcred->cr_groups[0] = flp->nfsfl_ffm[mirror].group; tcred->cr_ngroups = 1; } else tcred = cred; if (rwflag == NFSV4OPEN_ACCESSREAD) copylen = dp->nfsdi_rsize; else { copylen = dp->nfsdi_wsize; if (len > copylen && mp != NULL) { /* * When a mirrored configuration needs to do * multiple writes to each mirror, all writes * except the last one must be a multiple of * 4 bytes. This is required so that the XDR * does not need padding. * If possible, clip the size to an exact * multiple of the mbuf length, so that the * split will be on an mbuf boundary. */ copylen &= 0xfffffffc; if (copylen > mp->m_len) copylen = copylen / mp->m_len * mp->m_len; } } NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; NFSUNLOCKNODE(np); if (len > copylen && docommit == 0) xfer = copylen; else xfer = len; if (docommit != 0) { if (error == 0) { /* * Do last mirrored DS commit with this thread. */ if (mirror < flp->nfsfl_mirrorcnt - 1) error = nfsio_commitds(vp, off, xfer, *dspp, fhp, dp->nfsdi_vers, dp->nfsdi_minorvers, drpc, tcred, p); else error = nfsrpc_commitds(vp, off, xfer, *dspp, fhp, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); NFSCL_DEBUG(4, "commitds=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for commit\n"); nfscl_dserr(NFSV4OP_COMMIT, error, dp, lyp, *dspp); } } NFSCL_DEBUG(4, "aft nfsio_commitds=%d\n", error); if (error == 0) { /* * Set both eof and uio_resid = 0 to end any * loops. */ *eofp = 1; uiop->uio_resid = 0; } else { NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) { error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, off, xfer, fhp, 1, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); NFSCL_DEBUG(4, "readds=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for read\n"); nfscl_dserr(NFSV4OP_READ, error, dp, lyp, *dspp); } } else { if (flp->nfsfl_mirrorcnt == 1) { error = nfsrpc_writeds(vp, uiop, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, 0, 1, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); if (error == 0) { NFSLOCKCLSTATE(); lyp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } } else { m = mp; if (xfer < len) { /* The mbuf list must be split. */ m2 = nfsm_split(mp, xfer); if (m2 != NULL) mp = m2; else { m_freem(mp); error = EIO; } } NFSCL_DEBUG(4, "mcopy len=%jd xfer=%jd\n", (uintmax_t)len, (uintmax_t)xfer); /* * Do last write to a mirrored DS with this * thread. */ if (error == 0) { if (mirror < flp->nfsfl_mirrorcnt - 1) error = nfsio_writedsmir(vp, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, m, dp->nfsdi_vers, dp->nfsdi_minorvers, drpc, tcred, p); else error = nfsrpc_writedsmir(vp, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, m, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); } NFSCL_DEBUG(4, "nfsio_writedsmir=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for write\n"); nfscl_dserr(NFSV4OP_WRITE, error, dp, lyp, *dspp); } } } NFSCL_DEBUG(4, "aft read/writeds=%d\n", error); if (error == 0) { len -= xfer; off += xfer; } if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) NFSFREECRED(tcred); } NFSCL_DEBUG(4, "eo nfscl_dofflayoutio=%d\n", error); return (error); } /* * The actual read RPC done to a DS. */ static int nfsrpc_readds(vnode_t vp, struct uio *uiop, nfsv4stateid_t *stateidp, int *eofp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, int flex, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int attrflag, error, retlen; struct nfsrv_descript nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_READDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_readds: vers4 minvers=%d\n", minorvers); if (flex != 0) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); else nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO); } else { nfscl_reqstart(nd, NFSPROC_READ, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READ]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READDS]); NFSCL_DEBUG(4, "nfsrpc_readds: vers3\n"); } NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3); txdr_hyper(io_off, tl); *(tl + 2) = txdr_unsigned(len); nrp = dsp->nfsclds_sockp; NFSCL_DEBUG(4, "nfsrpc_readds: nrp=%p\n", nrp); if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_readds: stat=%d err=%d\n", nd->nd_repstat, error); if (error != 0) return (error); if (vers == NFS_VER3) { error = nfscl_postop_attr(nd, &na, &attrflag, NULL); NFSCL_DEBUG(4, "nfsrpc_readds: postop=%d\n", error); if (error != 0) goto nfsmout; } if (nd->nd_repstat != 0) { error = nd->nd_repstat; goto nfsmout; } if (vers == NFS_VER3) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); *eofp = fxdr_unsigned(int, *(tl + 1)); } else { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); *eofp = fxdr_unsigned(int, *tl); } NFSM_STRSIZ(retlen, len); NFSCL_DEBUG(4, "nfsrpc_readds: retlen=%d eof=%d\n", retlen, *eofp); error = nfsm_mbufuio(nd, uiop, retlen); nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } /* * The actual write RPC done to a DS. */ static int nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, int commit_thru_mds, int flex, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int attrflag, error, rlen, commit, committed = NFSWRITE_FILESYNC; int32_t backup; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1")); nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSCL_DEBUG(4, "nfsrpc_writeds: vers4 minvers=%d\n", minorvers); vers = NFS_VER4; if (flex != 0) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); else nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); } else { nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]); NFSCL_DEBUG(4, "nfsrpc_writeds: vers3\n"); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED); } txdr_hyper(io_off, tl); tl += 2; if (vers == NFS_VER3) *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); nfsm_uiombuf(nd, uiop, len); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_writeds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) { /* * In case the rpc gets retried, roll * the uio fields changed by nfsm_uiombuf() * back. */ uiop->uio_offset -= len; uiop->uio_resid += len; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - len; uiop->uio_iov->iov_len += len; error = nd->nd_repstat; } else { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_writeds: wcc_data=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen); if (rlen == 0) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. */ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; if (commit_thru_mds != 0) { NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); } NFSUNLOCKMNT(nmp); } else { NFSLOCKDS(dsp); if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF; } else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); } NFSUNLOCKDS(dsp); } } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; return (error); } /* * The actual write RPC done to a DS. * This variant is called from a separate kernel process for mirrors. * Any short write is considered an IO error. */ static int nfsrpc_writedsmir(vnode_t vp, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int attrflag, error, commit, committed = NFSWRITE_FILESYNC, rlen; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers4 minvers=%d\n", minorvers); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); } else { nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]); NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers3\n"); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED); } txdr_hyper(io_off, tl); tl += 2; if (vers == NFS_VER3) *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); if (len > 0) { /* Put data in mbuf chain. */ nd->nd_mb->m_next = m; } nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_writedsmir: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; else { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_writedsmir: wcc_data=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n", len, rlen); if (rlen != len) { error = NFSERR_IO; NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n", len, rlen); goto nfsmout; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. */ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; NFSLOCKDS(dsp); if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF; } else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); } NFSUNLOCKDS(dsp); } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; return (error); } /* * Start up the thread that will execute nfsrpc_writedsmir(). */ static void start_writedsmir(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_writedsmir(drpc->vp, &drpc->iomode, &drpc->must_commit, drpc->stateidp, drpc->dsp, drpc->off, drpc->len, drpc->fhp, drpc->m, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_writedsmir: err=%d\n", drpc->err); } /* * Set up the write DS mirror call for the pNFS I/O thread. */ static int nfsio_writedsmir(vnode_t vp, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t off, int len, struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->iomode = *iomode; drpc->must_commit = *must_commit; drpc->stateidp = stateidp; drpc->dsp = dsp; drpc->off = off; drpc->len = len; drpc->fhp = fhp; drpc->m = m; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_writedsmir, drpc); NFSCL_DEBUG(4, "nfsio_writedsmir: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_writedsmir(vp, iomode, must_commit, stateidp, dsp, off, len, fhp, m, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_writedsmir: error=%d\n", error); return (error); } /* * Free up the nfsclds structure. */ void nfscl_freenfsclds(struct nfsclds *dsp) { int i; if (dsp == NULL) return; if (dsp->nfsclds_sockp != NULL) { NFSFREECRED(dsp->nfsclds_sockp->nr_cred); NFSFREEMUTEX(&dsp->nfsclds_sockp->nr_mtx); free(dsp->nfsclds_sockp->nr_nam, M_SONAME); free(dsp->nfsclds_sockp, M_NFSSOCKREQ); } NFSFREEMUTEX(&dsp->nfsclds_mtx); NFSFREEMUTEX(&dsp->nfsclds_sess.nfsess_mtx); for (i = 0; i < NFSV4_CBSLOTS; i++) { if (dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply != NULL) m_freem( dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply); } free(dsp, M_NFSCLDS); } static enum nfsclds_state nfscl_getsameserver(struct nfsmount *nmp, struct nfsclds *newdsp, struct nfsclds **retdspp, uint32_t *sequencep) { struct nfsclds *dsp; int fndseq; /* * Search the list of nfsclds structures for one with the same * server. */ fndseq = 0; TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) { if (dsp->nfsclds_servownlen == newdsp->nfsclds_servownlen && dsp->nfsclds_servownlen != 0 && !NFSBCMP(dsp->nfsclds_serverown, newdsp->nfsclds_serverown, dsp->nfsclds_servownlen) && dsp->nfsclds_sess.nfsess_defunct == 0) { NFSCL_DEBUG(4, "fnd same fdsp=%p dsp=%p flg=0x%x\n", TAILQ_FIRST(&nmp->nm_sess), dsp, dsp->nfsclds_flags); if (fndseq == 0) { /* Get sequenceid# from first entry. */ *sequencep = dsp->nfsclds_sess.nfsess_sequenceid; fndseq = 1; } /* Server major id matches. */ if ((dsp->nfsclds_flags & NFSCLDS_DS) != 0) { *retdspp = dsp; return (NFSDSP_USETHISSESSION); } } } if (fndseq != 0) return (NFSDSP_SEQTHISSESSION); return (NFSDSP_NOTFOUND); } /* * NFS commit rpc to a NFSv4.1 DS. */ static int nfsrpc_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfssockreq *nrp; struct nfsvattr na; int attrflag, error; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_COMMITDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; } else { nfscl_reqstart(nd, NFSPROC_COMMIT, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMIT]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMITDS]); } NFSCL_DEBUG(4, "nfsrpc_commitds: vers=%d minvers=%d\n", vers, minorvers); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_commitds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_commitds: wccdata=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); NFSLOCKDS(dsp); if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); error = NFSERR_STALEWRITEVERF; } NFSUNLOCKDS(dsp); } nfsmout: if (error == 0 && nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Start up the thread that will execute nfsrpc_commitds(). */ static void start_commitds(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_commitds(drpc->vp, drpc->off, drpc->len, drpc->dsp, drpc->fhp, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_commitds: err=%d\n", drpc->err); } /* * Set up the commit DS mirror call for the pNFS I/O thread. */ static int nfsio_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->off = offset; drpc->len = cnt; drpc->dsp = dsp; drpc->fhp = fhp; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_commitds, drpc); NFSCL_DEBUG(4, "nfsio_commitds: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_commitds(vp, offset, cnt, dsp, fhp, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_commitds: error=%d\n", error); return (error); } /* * NFS Advise rpc */ int nfsrpc_advise(vnode_t vp, off_t offset, uint64_t cnt, int advise, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t hints; int error; NFSZERO_ATTRBIT(&hints); if (advise == POSIX_FADV_WILLNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED); else if (advise == POSIX_FADV_DONTNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED); else return (0); NFSCL_REQSTART(nd, NFSPROC_IOADVISE, vp); nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER); txdr_hyper(offset, tl); tl += 2; txdr_hyper(cnt, tl); nfsrv_putattrbit(nd, &hints); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } #ifdef notyet /* * NFS advise rpc to a NFSv4.2 DS. */ static int nfsrpc_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfssockreq *nrp; nfsattrbit_t hints; int error; /* For NFS DSs prior to NFSv4.2, just return OK. */ if (vers == NFS_VER3 || minorversion < NFSV42_MINORVERSION) return (0); NFSZERO_ATTRBIT(&hints); if (advise == POSIX_FADV_WILLNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED); else if (advise == POSIX_FADV_DONTNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED); else return (0); nd->nd_mrep = NULL; nfscl_reqstart(nd, NFSPROC_IOADVISEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_adviseds: vers=%d minvers=%d\n", vers, minorvers); nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsrv_putattrbit(nd, &hints); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_adviseds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Start up the thread that will execute nfsrpc_commitds(). */ static void start_adviseds(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_adviseds(drpc->vp, drpc->off, drpc->len, drpc->advise, drpc->dsp, drpc->fhp, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_adviseds: err=%d\n", drpc->err); } /* * Set up the commit DS mirror call for the pNFS I/O thread. */ static int nfsio_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->off = offset; drpc->len = cnt; drpc->advise = advise; drpc->dsp = dsp; drpc->fhp = fhp; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_adviseds, drpc); NFSCL_DEBUG(4, "nfsio_adviseds: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_adviseds(vp, offset, cnt, advise, dsp, fhp, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_adviseds: error=%d\n", error); return (error); } #endif /* notyet */ /* * Do the Allocate operation, retrying for recovery. */ int nfsrpc_allocate(vnode_t vp, off_t off, off_t len, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { int error, expireret = 0, retrycnt, nostateid; uint32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; off_t tmp_off; void *lckp; if (len < 0) return (EINVAL); if (len == 0) return (0); tmp_off = off + len; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < off) { NFSUNLOCKMNT(nmp); return (EFBIG); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; NFSUNLOCKMNT(nmp); nfhp = VTONFS(vp)->n_fhp; retrycnt = 0; do { lckp = NULL; nostateid = 0; nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp); if (stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { nostateid = 1; NFSCL_DEBUG(1, "stateid0 in allocate\n"); } /* * Not finding a stateid should probably never happen, * but just return an error for this case. */ if (nostateid != 0) error = EIO; else error = nfsrpc_allocaterpc(vp, off, len, &stateid, nap, attrflagp, cred, p, stuff); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_allocate"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && retrycnt >= 4) error = EIO; return (error); } /* * The allocate RPC. */ static int nfsrpc_allocaterpc(vnode_t vp, off_t off, off_t len, nfsv4stateid_t *stateidp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_ALLOCATE, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = NFS_LATTR_NOSHRINK; } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Set up the XDR arguments for the LayoutGet operation. */ static void nfsrv_setuplayoutget(struct nfsrv_descript *nd, int iomode, uint64_t offset, uint64_t len, uint64_t minlen, nfsv4stateid_t *stateidp, int layouttype, int layoutlen, int usecurstateid) { uint32_t *tl; NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED + 3 * NFSX_HYPER + NFSX_STATEID); *tl++ = newnfs_false; /* Don't signal availability. */ *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(iomode); txdr_hyper(offset, tl); tl += 2; txdr_hyper(len, tl); tl += 2; txdr_hyper(minlen, tl); tl += 2; if (usecurstateid != 0) { /* Special stateid for Current stateid. */ *tl++ = txdr_unsigned(1); *tl++ = 0; *tl++ = 0; *tl++ = 0; } else { *tl++ = txdr_unsigned(stateidp->seqid); NFSCL_DEBUG(4, "layget seq=%d\n", (int)stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; } *tl = txdr_unsigned(layoutlen); } /* * Parse the reply for a successful LayoutGet operation. */ static int nfsrv_parselayoutget(struct nfsmount *nmp, struct nfsrv_descript *nd, nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp) { uint32_t *tl; struct nfsclflayout *flp, *prevflp, *tflp; int cnt, error, fhcnt, gotiomode, i, iomode, j, k, l, laytype, nfhlen; int m, mirrorcnt; uint64_t retlen, off; struct nfsfh *nfhp; uint8_t *cp; uid_t user; gid_t grp; NFSCL_DEBUG(4, "in nfsrv_parselayoutget\n"); error = 0; flp = NULL; gotiomode = -1; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_STATEID); if (*tl++ != 0) *retonclosep = 1; else *retonclosep = 0; stateidp->seqid = fxdr_unsigned(uint32_t, *tl++); NFSCL_DEBUG(4, "retoncls=%d stseq=%d\n", *retonclosep, (int)stateidp->seqid); stateidp->other[0] = *tl++; stateidp->other[1] = *tl++; stateidp->other[2] = *tl++; cnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "layg cnt=%d\n", cnt); if (cnt <= 0 || cnt > 10000) { /* Don't accept more than 10000 layouts in reply. */ error = NFSERR_BADXDR; goto nfsmout; } for (i = 0; i < cnt; i++) { /* Dissect to the layout type. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; retlen = fxdr_hyper(tl); tl += 2; iomode = fxdr_unsigned(int, *tl++); laytype = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "layt=%d off=%ju len=%ju iom=%d\n", laytype, (uintmax_t)off, (uintmax_t)retlen, iomode); /* Ignore length of layout body for now. */ if (laytype == NFSLAYOUT_NFSV4_1_FILES) { /* Parse the File layout up to fhcnt. */ NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_HYPER + NFSX_V4DEVICEID); fhcnt = fxdr_unsigned(int, *(tl + 4 + NFSX_V4DEVICEID / NFSX_UNSIGNED)); NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt); if (fhcnt < 0 || fhcnt > 100) { /* Don't accept more than 100 file handles. */ error = NFSERR_BADXDR; goto nfsmout; } if (fhcnt > 0) flp = malloc(sizeof(*flp) + fhcnt * sizeof(struct nfsfh *), M_NFSFLAYOUT, M_WAITOK); else flp = malloc(sizeof(*flp), M_NFSFLAYOUT, M_WAITOK); flp->nfsfl_flags = NFSFL_FILE; flp->nfsfl_fhcnt = 0; flp->nfsfl_devp = NULL; flp->nfsfl_off = off; if (flp->nfsfl_off + retlen < flp->nfsfl_off) flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off; else flp->nfsfl_end = flp->nfsfl_off + retlen; flp->nfsfl_iomode = iomode; if (gotiomode == -1) gotiomode = flp->nfsfl_iomode; /* Ignore layout body length for now. */ NFSBCOPY(tl, flp->nfsfl_dev, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); flp->nfsfl_util = fxdr_unsigned(uint32_t, *tl++); NFSCL_DEBUG(4, "flutil=0x%x\n", flp->nfsfl_util); mtx_lock(&nmp->nm_mtx); if (nmp->nm_minorvers > 1 && (flp->nfsfl_util & NFSFLAYUTIL_IOADVISE_THRU_MDS) != 0) nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS; mtx_unlock(&nmp->nm_mtx); flp->nfsfl_stripe1 = fxdr_unsigned(uint32_t, *tl++); flp->nfsfl_patoff = fxdr_hyper(tl); tl += 2; NFSCL_DEBUG(4, "stripe1=%u poff=%ju\n", flp->nfsfl_stripe1, (uintmax_t)flp->nfsfl_patoff); for (j = 0; j < fhcnt; j++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); nfhlen = fxdr_unsigned(int, *tl); if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) { error = NFSERR_BADXDR; goto nfsmout; } nfhp = malloc(sizeof(*nfhp) + nfhlen - 1, M_NFSFH, M_WAITOK); flp->nfsfl_fh[j] = nfhp; flp->nfsfl_fhcnt++; nfhp->nfh_len = nfhlen; NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen)); NFSBCOPY(cp, nfhp->nfh_fh, nfhlen); } } else if (laytype == NFSLAYOUT_FLEXFILE) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER); mirrorcnt = fxdr_unsigned(int, *(tl + 2)); NFSCL_DEBUG(4, "mirrorcnt=%d\n", mirrorcnt); if (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS) { error = NFSERR_BADXDR; goto nfsmout; } flp = malloc(sizeof(*flp) + mirrorcnt * sizeof(struct nfsffm), M_NFSFLAYOUT, M_WAITOK); flp->nfsfl_flags = NFSFL_FLEXFILE; flp->nfsfl_mirrorcnt = mirrorcnt; for (j = 0; j < mirrorcnt; j++) flp->nfsfl_ffm[j].devp = NULL; flp->nfsfl_off = off; if (flp->nfsfl_off + retlen < flp->nfsfl_off) flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off; else flp->nfsfl_end = flp->nfsfl_off + retlen; flp->nfsfl_iomode = iomode; if (gotiomode == -1) gotiomode = flp->nfsfl_iomode; flp->nfsfl_stripeunit = fxdr_hyper(tl); NFSCL_DEBUG(4, "stripeunit=%ju\n", (uintmax_t)flp->nfsfl_stripeunit); for (j = 0; j < mirrorcnt; j++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); k = fxdr_unsigned(int, *tl); if (k < 1 || k > 128) { error = NFSERR_BADXDR; goto nfsmout; } NFSCL_DEBUG(4, "servercnt=%d\n", k); for (l = 0; l < k; l++) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4DEVICEID + NFSX_STATEID + 2 * NFSX_UNSIGNED); if (l == 0) { /* Just use the first server. */ NFSBCOPY(tl, flp->nfsfl_ffm[j].dev, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); tl++; flp->nfsfl_ffm[j].st.seqid = *tl++; flp->nfsfl_ffm[j].st.other[0] = *tl++; flp->nfsfl_ffm[j].st.other[1] = *tl++; flp->nfsfl_ffm[j].st.other[2] = *tl++; NFSCL_DEBUG(4, "st.seqid=%u " "st.o0=0x%x st.o1=0x%x " "st.o2=0x%x\n", flp->nfsfl_ffm[j].st.seqid, flp->nfsfl_ffm[j].st.other[0], flp->nfsfl_ffm[j].st.other[1], flp->nfsfl_ffm[j].st.other[2]); } else tl += ((NFSX_V4DEVICEID + NFSX_STATEID + NFSX_UNSIGNED) / NFSX_UNSIGNED); fhcnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt); if (fhcnt < 1 || fhcnt > NFSDEV_MAXVERS) { error = NFSERR_BADXDR; goto nfsmout; } for (m = 0; m < fhcnt; m++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); nfhlen = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "nfhlen=%d\n", nfhlen); if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) { error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen)); if (l == 0) { flp->nfsfl_ffm[j].fhcnt = fhcnt; nfhp = malloc( sizeof(*nfhp) + nfhlen - 1, M_NFSFH, M_WAITOK); flp->nfsfl_ffm[j].fh[m] = nfhp; nfhp->nfh_len = nfhlen; NFSBCOPY(cp, nfhp->nfh_fh, nfhlen); NFSCL_DEBUG(4, "got fh\n"); } } /* Now, get the ffsd_user/ffds_group. */ error = nfsrv_parseug(nd, 0, &user, &grp, curthread); NFSCL_DEBUG(4, "after parseu=%d\n", error); if (error == 0) error = nfsrv_parseug(nd, 1, &user, &grp, curthread); NFSCL_DEBUG(4, "aft parseg=%d\n", grp); if (error != 0) goto nfsmout; NFSCL_DEBUG(4, "user=%d group=%d\n", user, grp); if (l == 0) { flp->nfsfl_ffm[j].user = user; flp->nfsfl_ffm[j].group = grp; NFSCL_DEBUG(4, "usr=%d grp=%d\n", user, grp); } } } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); flp->nfsfl_fflags = fxdr_unsigned(uint32_t, *tl++); #ifdef notnow /* * At this time, there is no flag. * NFSFLEXFLAG_IOADVISE_THRU_MDS might need to be * added, or it may never exist? */ mtx_lock(&nmp->nm_mtx); if (nmp->nm_minorvers > 1 && (flp->nfsfl_fflags & NFSFLEXFLAG_IOADVISE_THRU_MDS) != 0) nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS; mtx_unlock(&nmp->nm_mtx); #endif flp->nfsfl_statshint = fxdr_unsigned(uint32_t, *tl); NFSCL_DEBUG(4, "fflags=0x%x statshint=%d\n", flp->nfsfl_fflags, flp->nfsfl_statshint); } else { error = NFSERR_BADXDR; goto nfsmout; } if (flp->nfsfl_iomode == gotiomode) { /* Keep the list in increasing offset order. */ tflp = LIST_FIRST(flhp); prevflp = NULL; while (tflp != NULL && tflp->nfsfl_off < flp->nfsfl_off) { prevflp = tflp; tflp = LIST_NEXT(tflp, nfsfl_list); } if (prevflp == NULL) LIST_INSERT_HEAD(flhp, flp, nfsfl_list); else LIST_INSERT_AFTER(prevflp, flp, nfsfl_list); NFSCL_DEBUG(4, "flp inserted\n"); } else { printf("nfscl_layoutget(): got wrong iomode\n"); nfscl_freeflayout(flp); } flp = NULL; } nfsmout: NFSCL_DEBUG(4, "eo nfsrv_parselayoutget=%d\n", error); if (error != 0 && flp != NULL) nfscl_freeflayout(flp); return (error); } /* * Parse a user/group digit string. */ static int nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp, NFSPROC_T *p) { uint32_t *tl; char *cp, *str, str0[NFSV4_SMALLSTR + 1]; uint32_t len = 0; int error = 0; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(uint32_t, *tl); str = NULL; if (len > NFSV4_OPAQUELIMIT) { error = NFSERR_BADXDR; goto nfsmout; } NFSCL_DEBUG(4, "nfsrv_parseug: len=%d\n", len); if (len == 0) { if (dogrp != 0) *gidp = GID_NOGROUP; else *uidp = UID_NOBODY; return (0); } if (len > NFSV4_SMALLSTR) str = malloc(len + 1, M_TEMP, M_WAITOK); else str = str0; NFSM_DISSECT(cp, char *, NFSM_RNDUP(len)); NFSBCOPY(cp, str, len); str[len] = '\0'; NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str); if (dogrp != 0) error = nfsv4_strtogid(nd, str, len, gidp); else error = nfsv4_strtouid(nd, str, len, uidp); nfsmout: if (len > NFSV4_SMALLSTR) free(str, M_TEMP); NFSCL_DEBUG(4, "eo nfsrv_parseug=%d\n", error); return (error); } /* * Similar to nfsrpc_getlayout(), except that it uses nfsrpc_openlayget(), * so that it does both an Open and a Layoutget. */ static int nfsrpc_getopenlayout(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode, struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p) { struct nfscllayout *lyp; struct nfsclflayout *flp; struct nfsclflayouthead flh; int error, islocked, layoutlen, recalled, retonclose, usecurstateid; int layouttype, laystat; nfsv4stateid_t stateid; struct nfsclsession *tsep; error = 0; if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; /* * If lyp is returned non-NULL, there will be a refcnt (shared lock) * on it, iff flp != NULL or a lock (exclusive lock) on it iff * flp == NULL. */ lyp = nfscl_getlayout(nmp->nm_clp, newfhp, newfhlen, 0, mode, &flp, &recalled); NFSCL_DEBUG(4, "nfsrpc_getopenlayout nfscl_getlayout lyp=%p\n", lyp); if (lyp == NULL) islocked = 0; else if (flp != NULL) islocked = 1; else islocked = 2; if ((lyp == NULL || flp == NULL) && recalled == 0) { LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); if (lyp == NULL) usecurstateid = 1; else { usecurstateid = 0; stateid.seqid = lyp->nfsly_stateid.seqid; stateid.other[0] = lyp->nfsly_stateid.other[0]; stateid.other[1] = lyp->nfsly_stateid.other[1]; stateid.other[2] = lyp->nfsly_stateid.other[2]; } error = nfsrpc_openlayoutrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, dpp, &stateid, usecurstateid, layouttype, layoutlen, &retonclose, &flh, &laystat, cred, p); NFSCL_DEBUG(4, "aft nfsrpc_openlayoutrpc laystat=%d err=%d\n", laystat, error); laystat = nfsrpc_layoutgetres(nmp, vp, newfhp, newfhlen, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, &islocked, cred, p); } else error = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, dpp, 0, 0, cred, p, 0, 0); if (islocked == 2) nfscl_rellayout(lyp, 1); else if (islocked == 1) nfscl_rellayout(lyp, 0); return (error); } /* * This function does an Open+LayoutGet for an NFSv4.1 mount with pNFS * enabled, only for the CLAIM_NULL case. All other NFSv4 Opens are * handled by nfsrpc_openrpc(). * For the case where op == NULL, dvp is the directory. When op != NULL, it * can be NULL. */ static int nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode, struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp, nfsv4stateid_t *stateidp, int usecurstateid, int layouttype, int layoutlen, int *retonclosep, struct nfsclflayouthead *flhp, int *laystatp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscldeleg *ndp = NULL; struct nfsvattr nfsva; struct nfsclsession *tsep; uint32_t rflags, deleg; nfsattrbit_t attrbits; int error, ret, acesize, limitby, iomode; *dpp = NULL; *laystatp = ENXIO; nfscl_reqstart(nd, NFSPROC_OPENLAYGET, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); nfsm_strtom(nd, name, namelen); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LAYOUTGET); if ((mode & NFSV4OPEN_ACCESSWRITE) != 0) iomode = NFSLAYOUTIOMODE_RW; else iomode = NFSLAYOUTIOMODE_READ; nfsrv_setuplayoutget(nd, iomode, 0, UINT64_MAX, 0, stateidp, layouttype, layoutlen, usecurstateid); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (nd->nd_repstat != 0) *laystatp = nd->nd_repstat; if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* ND_NOMOREDATA will be set if the Open operation failed. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(u_int32_t, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) op->nfso_own->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); ndp = malloc(sizeof(struct nfscldeleg) + newfhlen, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&ndp->nfsdl_owner); LIST_INIT(&ndp->nfsdl_lock); ndp->nfsdl_clp = op->nfso_own->nfsow_clp; ndp->nfsdl_fhlen = newfhlen; NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen); newnfs_copyincred(cred, &ndp->nfsdl_cred); nfscl_lockinit(&ndp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); ndp->nfsdl_stateid.seqid = *tl++; ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { ndp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: ndp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: ndp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); ndp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; }; } else ndp->nfsdl_flags = NFSCLDL_READ; if (ret != 0) ndp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, &ret, &acesize, p); if (error != 0) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) != 0 || nfscl_assumeposixlocks) op->nfso_posixlock = 1; else op->nfso_posixlock = 0; NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); /* If the 2nd element == NFS_OK, the Getattr succeeded. */ if (*++tl == 0) { error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error != 0) goto nfsmout; if (ndp != NULL) { ndp->nfsdl_change = nfsva.na_filerev; ndp->nfsdl_modtime = nfsva.na_mtime; ndp->nfsdl_flags |= NFSCLDL_MODTIMESET; *dpp = ndp; ndp = NULL; } /* * At this point, the Open has succeeded, so set * nd_repstat = NFS_OK. If the Layoutget failed, * this function just won't return a layout. */ if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); *laystatp = fxdr_unsigned(int, *++tl); if (*laystatp == 0) { error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error != 0) *laystatp = error; } } else nd->nd_repstat = 0; /* Return 0 for Open. */ } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: free(ndp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Similar nfsrpc_createv4(), but also does the LayoutGet operation. * Used only for mounts with pNFS enabled. */ static int nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp, nfsv4stateid_t *stateidp, int usecurstateid, int layouttype, int layoutlen, int *retonclosep, struct nfsclflayouthead *flhp, int *laystatp) { uint32_t *tl; int error = 0, deleg, newone, ret, acesize, limitby; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsclopen *op; struct nfscldeleg *dp = NULL; struct nfsnode *np; struct nfsfh *nfhp; struct nfsclsession *tsep; nfsattrbit_t attrbits; nfsv4stateid_t stateid; struct nfsmount *nmp; nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); *laystatp = ENXIO; *unlockedp = 0; *nfhpp = NULL; *dpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATELAYGET, dvp); /* * For V4, this is actually an Open op. */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_CREATE); if ((fmode & O_EXCL) != 0) { if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); nfsm_strtom(nd, name, namelen); /* Get the new file's handle and attributes, plus save the FH. */ NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_SAVEFH); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); /* Get the directory's post-op attributes. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_RESTOREFH); *tl = txdr_unsigned(NFSV4OP_LAYOUTGET); nfsrv_setuplayoutget(nd, NFSLAYOUTIOMODE_RW, 0, UINT64_MAX, 0, stateidp, layouttype, layoutlen, usecurstateid); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error != 0) return (error); NFSCL_DEBUG(4, "nfsrpc_createlayout stat=%d err=%d\n", nd->nd_repstat, error); if (nd->nd_repstat != 0) *laystatp = nd->nd_repstat; NFSCL_INCRSEQID(owp->nfsow_seqid, nd); if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSCL_DEBUG(4, "nfsrpc_createlayout open succeeded\n"); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); stateid.seqid = *tl++; stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; nfsrv_getattrbits(nd, &attrbits, NULL, NULL); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) owp->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); dp = malloc(sizeof(struct nfscldeleg) + NFSX_V4FHMAX, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&dp->nfsdl_owner); LIST_INIT(&dp->nfsdl_lock); dp->nfsdl_clp = owp->nfsow_clp; newnfs_copyincred(cred, &dp->nfsdl_cred); nfscl_lockinit(&dp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); dp->nfsdl_stateid.seqid = *tl++; dp->nfsdl_stateid.other[0] = *tl++; dp->nfsdl_stateid.other[1] = *tl++; dp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { dp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: dp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: dp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); dp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; }; } else { dp->nfsdl_flags = NFSCLDL_READ; } if (ret != 0) dp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &dp->nfsdl_ace, &ret, &acesize, p); if (error != 0) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } /* Now, we should have the status for the SaveFH. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl == 0) { NFSCL_DEBUG(4, "nfsrpc_createlayout SaveFH ok\n"); /* * Now, process the GetFH and Getattr for the newly * created file. nfscl_mtofh() will set * ND_NOMOREDATA if these weren't successful. */ error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); NFSCL_DEBUG(4, "aft nfscl_mtofh err=%d\n", error); if (error != 0) goto nfsmout; } else nd->nd_flag |= ND_NOMOREDATA; /* Now we have the PutFH and Getattr for the directory. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; else { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } } if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); NFSCL_DEBUG(4, "aft nfsm_loadattr err=%d\n", error); if (error != 0) goto nfsmout; *dattrflagp = 1; if (dp != NULL && *attrflagp != 0) { dp->nfsdl_change = nnap->na_filerev; dp->nfsdl_modtime = nnap->na_mtime; dp->nfsdl_flags |= NFSCLDL_MODTIMESET; } /* * We can now complete the Open state. */ nfhp = *nfhpp; if (dp != NULL) { dp->nfsdl_fhlen = nfhp->nfh_len; NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len); } /* * Get an Open structure that will be * attached to the OpenOwner, acquired already. */ error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, NULL, &op, &newone, NULL, 0, false); if (error != 0) goto nfsmout; op->nfso_stateid = stateid; newnfs_copyincred(cred, &op->nfso_cred); nfscl_openrelease(nmp, op, error, newone); *unlockedp = 1; /* Now, handle the RestoreFH and LayoutGet. */ if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED); *laystatp = fxdr_unsigned(int, *(tl + 3)); if (*laystatp == 0) { error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error != 0) *laystatp = error; } NFSCL_DEBUG(4, "aft nfsrv_parselayout err=%d\n", error); } else nd->nd_repstat = 0; } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID || error == NFSERR_BADSESSION) nfscl_initiate_recovery(owp->nfsow_clp); nfsmout: NFSCL_DEBUG(4, "eo nfsrpc_createlayout err=%d\n", error); if (error == 0) *dpp = dp; else free(dp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Similar to nfsrpc_getopenlayout(), except that it used for the Create case. */ static int nfsrpc_getcreatelayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp) { struct nfscllayout *lyp; struct nfsclflayouthead flh; struct nfsfh *nfhp; struct nfsclsession *tsep; struct nfsmount *nmp; nfsv4stateid_t stateid; int error, layoutlen, layouttype, retonclose, laystat; error = 0; nmp = VFSTONFS(dvp->v_mount); if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); error = nfsrpc_createlayout(dvp, name, namelen, vap, cverf, fmode, owp, dpp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, unlockedp, &stateid, 1, layouttype, layoutlen, &retonclose, &flh, &laystat); NFSCL_DEBUG(4, "aft nfsrpc_createlayoutrpc laystat=%d err=%d\n", laystat, error); lyp = NULL; if (laystat == 0) { nfhp = *nfhpp; laystat = nfsrpc_layoutgetres(nmp, dvp, nfhp->nfh_fh, nfhp->nfh_len, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, NULL, cred, p); } else laystat = nfsrpc_layoutgetres(nmp, dvp, NULL, 0, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, NULL, cred, p); if (laystat == 0) nfscl_rellayout(lyp, 0); return (error); } /* * Process the results of a layoutget() operation. */ static int nfsrpc_layoutgetres(struct nfsmount *nmp, vnode_t vp, uint8_t *newfhp, int newfhlen, nfsv4stateid_t *stateidp, int retonclose, uint32_t *notifybit, struct nfscllayout **lypp, struct nfsclflayouthead *flhp, int layouttype, int laystat, int *islockedp, struct ucred *cred, NFSPROC_T *p) { struct nfsclflayout *tflp; struct nfscldevinfo *dip; uint8_t *dev; int i, mirrorcnt; if (laystat == NFSERR_UNKNLAYOUTTYPE) { NFSLOCKMNT(nmp); if (!NFSHASFLEXFILE(nmp)) { /* Switch to using Flex File Layout. */ nmp->nm_state |= NFSSTA_FLEXFILE; } else if (layouttype == NFSLAYOUT_FLEXFILE) { /* Disable pNFS. */ NFSCL_DEBUG(1, "disable PNFS\n"); nmp->nm_state &= ~(NFSSTA_PNFS | NFSSTA_FLEXFILE); } NFSUNLOCKMNT(nmp); } if (laystat == 0) { NFSCL_DEBUG(4, "nfsrpc_layoutgetres at FOREACH\n"); LIST_FOREACH(tflp, flhp, nfsfl_list) { if (layouttype == NFSLAYOUT_FLEXFILE) mirrorcnt = tflp->nfsfl_mirrorcnt; else mirrorcnt = 1; for (i = 0; i < mirrorcnt; i++) { laystat = nfscl_adddevinfo(nmp, NULL, i, tflp); NFSCL_DEBUG(4, "aft adddev=%d\n", laystat); if (laystat != 0) { if (layouttype == NFSLAYOUT_FLEXFILE) dev = tflp->nfsfl_ffm[i].dev; else dev = tflp->nfsfl_dev; laystat = nfsrpc_getdeviceinfo(nmp, dev, layouttype, notifybit, &dip, cred, p); NFSCL_DEBUG(4, "aft nfsrpc_gdi=%d\n", laystat); if (laystat != 0) goto out; laystat = nfscl_adddevinfo(nmp, dip, i, tflp); if (laystat != 0) printf("nfsrpc_layoutgetresout" ": cannot add\n"); } } } } out: if (laystat == 0) { /* * nfscl_layout() always returns with the nfsly_lock * set to a refcnt (shared lock). * Passing in dvp is sufficient, since it is only used to * get the fsid for the file system. */ laystat = nfscl_layout(nmp, vp, newfhp, newfhlen, stateidp, layouttype, retonclose, flhp, lypp, cred, p); NFSCL_DEBUG(4, "nfsrpc_layoutgetres: aft nfscl_layout=%d\n", laystat); if (laystat == 0 && islockedp != NULL) *islockedp = 1; } return (laystat); } /* * nfs copy_file_range operation. */ int nfsrpc_copy_file_range(vnode_t invp, off_t *inoffp, vnode_t outvp, off_t *outoffp, size_t *lenp, unsigned int flags, int *inattrflagp, struct nfsvattr *innap, int *outattrflagp, struct nfsvattr *outnap, struct ucred *cred, bool consecutive, bool *must_commitp) { int commit, error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(invp->v_mount); struct nfsfh *innfhp = NULL, *outnfhp = NULL; nfsv4stateid_t instateid, outstateid; void *inlckp, *outlckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; innfhp = VTONFS(invp)->n_fhp; outnfhp = VTONFS(outvp)->n_fhp; retrycnt = 0; do { /* Get both stateids. */ inlckp = NULL; nfscl_getstateid(invp, innfhp->nfh_fh, innfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, NULL, curthread, &instateid, &inlckp); outlckp = NULL; nfscl_getstateid(outvp, outnfhp->nfh_fh, outnfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, NULL, curthread, &outstateid, &outlckp); error = nfsrpc_copyrpc(invp, *inoffp, outvp, *outoffp, lenp, &instateid, &outstateid, innap, inattrflagp, outnap, outattrflagp, consecutive, &commit, cred, curthread); if (error == 0) { if (commit != NFSWRITE_FILESYNC) *must_commitp = true; *inoffp += *lenp; *outoffp += *lenp; } else if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (inlckp != NULL) nfscl_lockderef(inlckp); if (outlckp != NULL) nfscl_lockderef(outlckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_cfr"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, curthread); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && (retrycnt >= 4 || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER)) error = EIO; return (error); } /* * The copy RPC. */ static int nfsrpc_copyrpc(vnode_t invp, off_t inoff, vnode_t outvp, off_t outoff, size_t *lenp, nfsv4stateid_t *instateidp, nfsv4stateid_t *outstateidp, struct nfsvattr *innap, int *inattrflagp, struct nfsvattr *outnap, int *outattrflagp, bool consecutive, int *commitp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsmount *nmp; nfsattrbit_t attrbits; uint64_t len; nmp = VFSTONFS(outvp->v_mount); *inattrflagp = *outattrflagp = 0; *commitp = NFSWRITE_UNSTABLE; len = *lenp; *lenp = 0; if (len > nfs_maxcopyrange) len = nfs_maxcopyrange; NFSCL_REQSTART(nd, NFSPROC_COPY, invp); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); nfsm_fhtom(nd, VTONFS(outvp)->n_fhp->nfh_fh, VTONFS(outvp)->n_fhp->nfh_len, 0); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_COPY); nfsm_stateidtom(nd, instateidp, NFSSTATEID_PUTSTATEID); nfsm_stateidtom(nd, outstateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_HYPER + 4 * NFSX_UNSIGNED); txdr_hyper(inoff, tl); tl += 2; txdr_hyper(outoff, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (consecutive) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = newnfs_true; *tl++ = 0; *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWRITEGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, invp, p, cred, NULL); if (error != 0) return (error); if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* Get the input file's attributes. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1) == 0) { error = nfsm_loadattr(nd, innap); if (error != 0) goto nfsmout; *inattrflagp = 1; } else nd->nd_flag |= ND_NOMOREDATA; } /* Skip over return stat for PutFH. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } /* Skip over return stat for Copy. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl != 0) { /* There should be no callback ids. */ error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED + NFSX_VERF); len = fxdr_hyper(tl); tl += 2; *commitp = fxdr_unsigned(int, *tl++); NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); nd->nd_repstat = NFSERR_STALEWRITEVERF; } NFSUNLOCKMNT(nmp); tl += (NFSX_VERF / NFSX_UNSIGNED); if (nd->nd_repstat == 0 && *++tl != newnfs_true) /* Must be a synchronous copy. */ nd->nd_repstat = NFSERR_NOTSUPP; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, outnap); if (error == 0) *outattrflagp = NFS_LATTR_NOSHRINK; if (nd->nd_repstat == 0) *lenp = len; } else if (nd->nd_repstat == NFSERR_OFFLOADNOREQS) { /* * For the case where consecutive is not supported, but * synchronous is supported, we can try consecutive == false * by returning this error. Otherwise, return NFSERR_NOTSUPP, * since Copy cannot be done. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (!consecutive || *++tl == newnfs_false) nd->nd_repstat = NFSERR_NOTSUPP; } else nd->nd_repstat = NFSERR_BADXDR; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Seek operation. */ int nfsrpc_seek(vnode_t vp, off_t *offp, bool *eofp, int content, struct ucred *cred, struct nfsvattr *nap, int *attrflagp) { int error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; nfhp = np->n_fhp; retrycnt = 0; do { lckp = NULL; nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, cred, curthread, &stateid, &lckp); error = nfsrpc_seekrpc(vp, offp, &stateid, eofp, content, nap, attrflagp, cred); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_seek"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, curthread); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * The seek RPC. */ static int nfsrpc_seekrpc(vnode_t vp, off_t *offp, nfsv4stateid_t *stateidp, bool *eofp, int content, struct nfsvattr *nap, int *attrflagp, struct ucred *cred) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SEEK, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(*offp, tl); tl += 2; *tl++ = txdr_unsigned(content); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, curthread, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_HYPER); if (*tl++ == newnfs_true) *eofp = true; else *eofp = false; *offp = fxdr_hyper(tl); /* Just skip over Getattr op status. */ error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The getextattr RPC. */ int nfsrpc_getextattr(vnode_t vp, const char *name, struct uio *uiop, ssize_t *lenp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; uint32_t len, len2; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_GETEXTATTR, vp); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(uint32_t, *tl); /* Sanity check lengths. */ if (uiop != NULL && len > 0 && len <= IOSIZE_MAX && uiop->uio_resid <= UINT32_MAX) { len2 = uiop->uio_resid; if (len2 >= len) error = nfsm_mbufuio(nd, uiop, len); else { error = nfsm_mbufuio(nd, uiop, len2); if (error == 0) { /* * nfsm_mbufuio() advances to a multiple * of 4, so round up len2 as well. Then * we need to advance over the rest of * the data, rounding up the remaining * length. */ len2 = NFSM_RNDUP(len2); len2 = NFSM_RNDUP(len - len2); if (len2 > 0) error = nfsm_advance(nd, len2, -1); } } } else if (uiop == NULL && len > 0) { /* Just wants the length and not the data. */ error = nfsm_advance(nd, NFSM_RNDUP(len), -1); } else if (len > 0) error = ENOATTR; if (error != 0) goto nfsmout; *lenp = len; /* Just skip over Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The setextattr RPC. */ int nfsrpc_setextattr(vnode_t vp, const char *name, struct uio *uiop, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SETEXTATTR, vp); if (uiop->uio_resid > nd->nd_maxreq) { /* nd_maxreq is set by NFSCL_REQSTART(). */ m_freem(nd->nd_mreq); return (EINVAL); } NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4SXATTR_EITHER); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(uiop->uio_resid); nfsm_uiombuf(nd, uiop, uiop->uio_resid); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { /* Just skip over the reply and Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The removeextattr RPC. */ int nfsrpc_rmextattr(vnode_t vp, const char *name, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_RMEXTATTR, vp); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { /* Just skip over the reply and Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The listextattr RPC. */ int nfsrpc_listextattr(vnode_t vp, uint64_t *cookiep, struct uio *uiop, size_t *lenp, bool *eofp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int cnt, error, i, len; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; u_char c; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_LISTEXTATTR, vp); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(*cookiep, tl); tl += 2; *tl++ = txdr_unsigned(*lenp); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); *eofp = true; *lenp = 0; if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); *cookiep = fxdr_hyper(tl); tl += 2; cnt = fxdr_unsigned(int, *tl); if (cnt < 0) { error = EBADRPC; goto nfsmout; } for (i = 0; i < cnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0 || len > EXTATTR_MAXNAMELEN) { error = EBADRPC; goto nfsmout; } if (uiop == NULL) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else if (uiop->uio_resid >= len + 1) { c = len; error = uiomove(&c, sizeof(c), uiop); if (error == 0) error = nfsm_mbufuio(nd, uiop, len); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); *eofp = false; } if (error != 0) goto nfsmout; *lenp += (len + 1); } /* Get the eof and skip over the Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED); /* * *eofp is set false above, because it wasn't able to copy * all of the reply. */ if (*eofp && *tl == 0) *eofp = false; error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Split an mbuf list. For non-M_EXTPG mbufs, just use m_split(). */ static struct mbuf * nfsm_split(struct mbuf *mp, uint64_t xfer) { struct mbuf *m, *m2; vm_page_t pg; int i, j, left, pgno, plen, trim; char *cp, *cp2; if ((mp->m_flags & M_EXTPG) == 0) { m = m_split(mp, xfer, M_WAITOK); return (m); } /* Find the correct mbuf to split at. */ for (m = mp; m != NULL && xfer > m->m_len; m = m->m_next) xfer -= m->m_len; if (m == NULL) return (NULL); /* If xfer == m->m_len, we can just split the mbuf list. */ if (xfer == m->m_len) { m2 = m->m_next; m->m_next = NULL; return (m2); } /* Find the page to split at. */ pgno = 0; left = xfer; do { if (pgno == 0) plen = m_epg_pagelen(m, 0, m->m_epg_1st_off); else plen = m_epg_pagelen(m, pgno, 0); if (left <= plen) break; left -= plen; pgno++; } while (pgno < m->m_epg_npgs); if (pgno == m->m_epg_npgs) panic("nfsm_split: eroneous ext_pgs mbuf"); m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs); m2->m_epg_flags |= EPG_FLAG_ANON; /* * If left < plen, allocate a new page for the new mbuf * and copy the data after left in the page to this new * page. */ if (left < plen) { - do { - pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | - VM_ALLOC_WIRED); - if (pg == NULL) - vm_wait(NULL); - } while (pg == NULL); + pg = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_NODUMP | + VM_ALLOC_WIRED); m2->m_epg_pa[0] = VM_PAGE_TO_PHYS(pg); m2->m_epg_npgs = 1; /* Copy the data after left to the new page. */ trim = plen - left; cp = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[pgno]); if (pgno == 0) cp += m->m_epg_1st_off; cp += left; cp2 = (char *)(void *)PHYS_TO_DMAP(m2->m_epg_pa[0]); if (pgno == m->m_epg_npgs - 1) m2->m_epg_last_len = trim; else { cp2 += PAGE_SIZE - trim; m2->m_epg_1st_off = PAGE_SIZE - trim; m2->m_epg_last_len = m->m_epg_last_len; } memcpy(cp2, cp, trim); m2->m_len = trim; } else { m2->m_len = 0; m2->m_epg_last_len = m->m_epg_last_len; } /* Move the pages beyond pgno to the new mbuf. */ for (i = pgno + 1, j = m2->m_epg_npgs; i < m->m_epg_npgs; i++, j++) { m2->m_epg_pa[j] = m->m_epg_pa[i]; /* Never moves page 0. */ m2->m_len += m_epg_pagelen(m, i, 0); } m2->m_epg_npgs = j; m->m_epg_npgs = pgno + 1; m->m_epg_last_len = left; m->m_len = xfer; m2->m_next = m->m_next; m->m_next = NULL; return (m2); } /* * Do the NFSv4.1 Bind Connection to Session. * Called from the reconnect layer of the krpc (sys/rpc/clnt_rc.c). */ void nfsrpc_bindconnsess(CLIENT *cl, void *arg, struct ucred *cr) { struct nfscl_reconarg *rcp = (struct nfscl_reconarg *)arg; uint32_t res, *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct rpc_callextra ext; struct timeval utimeout; enum clnt_stat stat; int error; nfscl_reqstart(nd, NFSPROC_BINDCONNTOSESS, NULL, NULL, 0, NULL, NULL, NFS_VER4, rcp->minorvers); NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED); memcpy(tl, rcp->sessionid, NFSX_V4SESSIONID); tl += NFSX_V4SESSIONID / NFSX_UNSIGNED; *tl++ = txdr_unsigned(NFSCDFC4_FORE_OR_BOTH); *tl = newnfs_false; memset(&ext, 0, sizeof(ext)); utimeout.tv_sec = 30; utimeout.tv_usec = 0; ext.rc_auth = authunix_create(cr); nd->nd_mrep = NULL; stat = CLNT_CALL_MBUF(cl, &ext, NFSV4PROC_COMPOUND, nd->nd_mreq, &nd->nd_mrep, utimeout); AUTH_DESTROY(ext.rc_auth); if (stat != RPC_SUCCESS) { printf("nfsrpc_bindconnsess: call failed stat=%d\n", stat); return; } if (nd->nd_mrep == NULL) { printf("nfsrpc_bindconnsess: no reply args\n"); return; } error = 0; newnfs_realign(&nd->nd_mrep, M_WAITOK); nd->nd_md = nd->nd_mrep; nd->nd_dpos = mtod(nd->nd_md, char *); NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); nd->nd_repstat = fxdr_unsigned(uint32_t, *tl++); if (nd->nd_repstat == NFSERR_OK) { res = fxdr_unsigned(uint32_t, *tl); if (res > 0 && (error = nfsm_advance(nd, NFSM_RNDUP(res), -1)) != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED); tl += 3; if (!NFSBCMP(tl, rcp->sessionid, NFSX_V4SESSIONID)) { tl += NFSX_V4SESSIONID / NFSX_UNSIGNED; res = fxdr_unsigned(uint32_t, *tl); if (res != NFSCDFS4_BOTH) printf("nfsrpc_bindconnsess: did not " "return FS4_BOTH\n"); } else printf("nfsrpc_bindconnsess: not same " "sessionid\n"); } else if (nd->nd_repstat != NFSERR_BADSESSION) printf("nfsrpc_bindconnsess: returned %d\n", nd->nd_repstat); nfsmout: if (error != 0) printf("nfsrpc_bindconnsess: reply bad xdr\n"); m_freem(nd->nd_mrep); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index ffd41304be43..62fcbcd90b0b 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -1,6411 +1,6397 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * Copyright (c) 2018 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). */ #define PTmap ((pt_entry_t *)(PTDPTDI << PDRSHIFT)) #define PTD ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE))) #define PTDpde ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \ (PTDPTDI * PDESIZE))) /* * Translate a virtual address to the kernel virtual address of its page table * entry (PTE). This can be used recursively. If the address of a PTE as * previously returned by this macro is itself given as the argument, then the * address of the page directory entry (PDE) that maps the PTE will be * returned. * * This macro may be used before pmap_bootstrap() is called. */ #define vtopte(va) (PTmap + i386_btop(va)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) static int pgeflag = 0; /* PG_G or-in */ static int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; #ifdef PMAP_PAE_COMP pt_entry_t pg_nx; static uma_zone_t pdptzone; #else #define pg_nx 0 #endif _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS"); _Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0), "VM_MAX_KERNEL_ADDRESS"); _Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW"); _Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD"); extern int pat_works; extern int pg_ps_enabled; extern int elf32_nxstack; #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); extern int pv_entry_max, pv_entry_count; static int pv_entry_high_water = 0; static struct md_page *pv_table; extern int shpgperproc; static struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ static int pv_maxchunks; /* How many chunks we have KVA for */ static vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ static pt_entry_t *CMAP3; static pd_entry_t *KPTD; static caddr_t CADDR3; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3; static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3; #ifdef SMP static int PMAP1cpu, PMAP3cpu; extern int PMAP1changedcpu; #endif extern int PMAP1changed; extern int PMAP1unchanged; static struct mtx PMAP2mutex; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free); static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #ifdef PMAP_PAE_COMP static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait); #endif static void pmap_init_trm(void); static void pmap_invalidate_all_int(pmap_t pmap); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); extern char _end[]; extern u_long physfree; /* phys addr of next free page */ extern u_long vm86phystk;/* PA of vm86/bios stack */ extern u_long vm86paddr;/* address of vm86 region */ extern int vm86pa; /* phys addr of vm86 region */ extern u_long KERNend; /* phys addr end of kernel (just after bss) */ #ifdef PMAP_PAE_COMP pd_entry_t *IdlePTD_pae; /* phys addr of kernel PTD */ pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ pt_entry_t *KPTmap_pae; /* address of kernel page tables */ #define IdlePTD IdlePTD_pae #define KPTmap KPTmap_pae #else pd_entry_t *IdlePTD_nopae; pt_entry_t *KPTmap_nopae; #define IdlePTD IdlePTD_nopae #define KPTmap KPTmap_nopae #endif extern u_long KPTphys; /* phys addr of kernel page tables */ extern u_long tramp_idleptd; static u_long allocpages(u_int cnt, u_long *physfree) { u_long res; res = *physfree; *physfree += PAGE_SIZE * cnt; bzero((void *)res, PAGE_SIZE * cnt); return (res); } static void pmap_cold_map(u_long pa, u_long va, u_long cnt) { pt_entry_t *pt; for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) *pt = pa | PG_V | PG_RW | PG_A | PG_M; } static void pmap_cold_mapident(u_long pa, u_long cnt) { pmap_cold_map(pa, pa, cnt); } _Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); static void __CONCAT(PMTYPE, remap_lower)(bool enable) { int i; for (i = 0; i < LOWPTDI; i++) IdlePTD[i] = enable ? IdlePTD[LOWPTDI + i] : 0; load_cr3(rcr3()); /* invalidate TLB */ } /* * Called from locore.s before paging is enabled. Sets up the first * kernel page table. Since kernel is mapped with PA == VA, this code * does not require relocations. */ void __CONCAT(PMTYPE, cold)(void) { pt_entry_t *pt; u_long a; u_int cr3, ncr4; physfree = (u_long)&_end; if (bootinfo.bi_esymtab != 0) physfree = bootinfo.bi_esymtab; if (bootinfo.bi_kernend != 0) physfree = bootinfo.bi_kernend; physfree = roundup2(physfree, NBPDR); KERNend = physfree; /* Allocate Kernel Page Tables */ KPTphys = allocpages(NKPT, &physfree); KPTmap = (pt_entry_t *)KPTphys; /* Allocate Page Table Directory */ #ifdef PMAP_PAE_COMP /* XXX only need 32 bytes (easier for now) */ IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); #endif IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); /* * Allocate KSTACK. Leave a guard page between IdlePTD and * proc0kstack, to control stack overflow for thread0 and * prevent corruption of the page table. We leak the guard * physical memory due to 1:1 mappings. */ allocpages(1, &physfree); proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); /* vm86/bios stack */ vm86phystk = allocpages(1, &physfree); /* pgtable + ext + IOPAGES */ vm86paddr = vm86pa = allocpages(3, &physfree); /* Install page tables into PTD. Page table page 1 is wasted. */ for (a = 0; a < NKPT; a++) IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; #ifdef PMAP_PAE_COMP /* PAE install PTD pointers into PDPT */ for (a = 0; a < NPGPTD; a++) IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; #endif /* * Install recursive mapping for kernel page tables into * itself. */ for (a = 0; a < NPGPTD; a++) IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | PG_RW; /* * Initialize page table pages mapping physical address zero * through the (physical) end of the kernel. Many of these * pages must be reserved, and we reserve them all and map * them linearly for convenience. We do this even if we've * enabled PSE above; we'll just switch the corresponding * kernel PDEs before we turn on paging. * * This and all other page table entries allow read and write * access for various reasons. Kernel mappings never have any * access restrictions. */ pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI); pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI); pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); /* Map page table directory */ #ifdef PMAP_PAE_COMP pmap_cold_mapident((u_long)IdlePDPT, 1); #endif pmap_cold_mapident((u_long)IdlePTD, NPGPTD); /* Map early KPTmap. It is really pmap_cold_mapident. */ pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); /* Map proc0kstack */ pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); /* ISA hole already mapped */ pmap_cold_mapident(vm86phystk, 1); pmap_cold_mapident(vm86pa, 3); /* Map page 0 into the vm86 page table */ *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; /* ...likewise for the ISA hole for vm86 */ for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; a < atop(ISA_HOLE_LENGTH); a++, pt++) *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | PG_M | PG_V; /* Enable PSE, PGE, VME, and PAE if configured. */ ncr4 = 0; if ((cpu_feature & CPUID_PSE) != 0) { ncr4 |= CR4_PSE; pseflag = PG_PS; /* * Superpage mapping of the kernel text. Existing 4k * page table pages are wasted. */ for (a = KERNBASE; a < KERNend; a += NBPDR) IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | PG_RW | PG_V; } if ((cpu_feature & CPUID_PGE) != 0) { ncr4 |= CR4_PGE; pgeflag = PG_G; } ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; #ifdef PMAP_PAE_COMP ncr4 |= CR4_PAE; #endif if (ncr4 != 0) load_cr4(rcr4() | ncr4); /* Now enable paging */ #ifdef PMAP_PAE_COMP cr3 = (u_int)IdlePDPT; if ((cpu_feature & CPUID_PAT) == 0) wbinvd(); #else cr3 = (u_int)IdlePTD; #endif tramp_idleptd = cr3; load_cr3(cr3); load_cr0(rcr0() | CR0_PG); /* * Now running relocated at KERNBASE where the system is * linked to run. */ /* * Remove the lowest part of the double mapping of low memory * to get some null pointer checks. */ __CONCAT(PMTYPE, remap_lower)(false); kernel_vm_end = /* 0 + */ NKPT * NBPDR; #ifdef PMAP_PAE_COMP i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE; i386_pmap_PDRSHIFT = PDRSHIFT_PAE; #else i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE; i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE; #endif } static void __CONCAT(PMTYPE, set_nx)(void) { #ifdef PMAP_PAE_COMP if ((amd_feature & AMDID_NX) == 0) return; pg_nx = PG_NX; elf32_nxstack = 1; /* EFER.EFER_NXE is set in initializecpu(). */ #endif } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after pmap_cold() created initial * kernel page table and enabled paging, and just syncs the pmap * module with what has already been done. */ static void __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; u_long res; int i; res = atop(firstaddr - (vm_paddr_t)KERNLOAD); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. * However, using "firstaddr" may waste a few pages of the * kernel virtual address space, because pmap_cold() may not * have mapped every physical page that it allocated. * Preferably, pmap_cold() would provide a first unused * virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t)firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ kernel_pmap->pm_stats.resident_count = res; TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by pmap_cold(). However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; #ifdef PMAP_PAE_COMP if (!pae_mode) return; #else if (pae_mode) return; #endif CPU_FOREACH(i) { pc = pcpu_find(i); mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | MTX_NEW); pc->pc_copyout_maddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_maddr == 0) panic("unable to allocate non-sleepable copyout KVA"); sx_init(&pc->pc_copyout_slock, "cpslk"); pc->pc_copyout_saddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_saddr == 0) panic("unable to allocate sleepable copyout KVA"); pc->pc_pmap_eh_va = kva_alloc(ptoa(1)); if (pc->pc_pmap_eh_va == 0) panic("unable to allocate pmap_extract_and_hold KVA"); pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va); /* * Skip if the mappings have already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("unable to allocate CMAP KVA"); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + ptoa(2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ static void __CONCAT(PMTYPE, init_pat)(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } #ifdef PMAP_PAE_COMP static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void __CONCAT(PMTYPE, init)(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + ptoa(i); mpte->ref_count = 1; /* * Collect the page table pages that were replaced by a 2/4MB * page. They are filled with equivalent 4KB page mappings. */ if (pseflag != 0 && KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, true)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(NKPT); /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PMAP_PAE_COMP pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_CONTIG | UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; pmap_init_trm(); if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } extern u_long pmap_pde_demotions; extern u_long pmap_pde_mappings; extern u_long pmap_pde_p_failures; extern u_long pmap_pde_promotions; /*************************************************** * Low level helper routines..... ***************************************************/ static boolean_t __CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; if ((cpu_feature & CPUID_PAT) == 0) return (0); pat_idx = 0; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; /* See pmap_init_pat(). */ if (pat_works) { if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; } else { /* XXXKIB */ } return (pat_idx); } static bool __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused) { return (pg_ps_enabled); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pde = pmap_pde(kernel_pmap, va); pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else /* if ((newpde & PG_G) == 0) */ /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); } #ifdef SMP static void pmap_curcpu_cb_dummy(pmap_t pmap __unused, vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) { } /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va, pmap, pmap_curcpu_cb_dummy); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all_int(pmap); return; } sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva, pmap, pmap_curcpu_cb_dummy); sched_unpin(); } static void pmap_invalidate_all_int(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap, pmap_curcpu_cb_dummy); sched_unpin(); } static void pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) { wbinvd(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { smp_cache_flush(pmap_invalidate_cache_curcpu_cb); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; if (act->store == PCPU_GET(cpuid)) { pde = pmap_pde(kernel_pmap, act->va); pde_store(pde, act->newpde); } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap) invlpg(va); } static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } static void pmap_invalidate_all_int(pmap_t pmap) { if (pmap == kernel_pmap) invltlb(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void __CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_int(pmap, va); } static void __CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_int(pmap, sva, eva); } static void __CONCAT(PMTYPE, invalidate_all)(pmap_t pmap) { pmap_invalidate_all_int(pmap); } static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range_int(pmap, va, va + NBPDR - 1); else pmap_invalidate_page_int(pmap, va); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ static pt_entry_t * __CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t * pmap_pte_quick3(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP3 & PG_FRAME) != newpf) { *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR3); PMAP1changed++; } else #ifdef SMP if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); invlcaddr(PADDR3); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR3 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { pt_entry_t *eh_ptep, pte, *ptep; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde &= PG_FRAME; critical_enter(); eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep); if ((*eh_ptep & PG_FRAME) != pde) { *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M; invlcaddr((void *)PCPU_GET(pmap_eh_va)); } ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) & (NPTEPG - 1)); pte = *ptep; critical_exit(); return (pte); } /* * Extract from the kernel page table the physical address that is mapped by * the given virtual address "va". * * This function may be used before pmap_bootstrap() is called. */ static vm_paddr_t __CONCAT(PMTYPE, kextract)(vm_offset_t va) { vm_paddr_t pa; if ((pa = pte_load(&PTD[va >> PDRSHIFT])) & PG_PS) { pa = (pa & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the PDE at * this point! For example, vtopte() must not be used to * access the PTE because it would use the new PDE. It is, * however, safe to use the old PDE because the page table * page is preserved by the promotion. */ pa = KPTmap[i386_btop(va)]; pa = (pa & PG_FRAME) | (va & PAGE_MASK); } return (pa); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ static vm_paddr_t __CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte_ufast(pmap, va, pde); rtval = (pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ static vm_page_t __CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); } else { pte = pmap_pte_ufast(pmap, va, pde); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kremove)(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ static vm_offset_t __CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag != 0) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range_int(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_nx | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range_int(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range_int(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * There is not need to invalidate the recursive mapping since * we never instantiate such mapping for the usermode pmaps, * and never remove page table pages from the kernel pmap. * Put page on a list so that it is released since all TLB * shootdown is done. */ MPASS(pmap != kernel_pmap); pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (pmap == kernel_pmap) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Release a page table page reference after a failed attempt to create a * mapping. */ static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { struct spglist free; SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { /* * Although "va" was never mapped, paging-structure caches * could nonetheless have entries that refer to the freed * page table pages. Invalidate those entries. */ pmap_invalidate_page_int(pmap, va); vm_page_free_pages_toq(&free, true); } } /* * Initialize the pmap for the swapper process. */ static void __CONCAT(PMTYPE, pinit0)(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP pmap->pm_pdpt = IdlePDPT; #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap_activate_boot(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static int __CONCAT(PMTYPE, pinit)(pmap_t pmap) { - vm_page_t m; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #ifdef PMAP_PAE_COMP pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD; i++) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); - pmap->pm_ptdpg[i] = m; + pmap->pm_ptdpg[i] = vm_page_alloc_noobj(VM_ALLOC_WIRED | + VM_ALLOC_ZERO | VM_ALLOC_WAITOK); #ifdef PMAP_PAE_COMP - pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; + pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(pmap->pm_ptdpg[i]) | PG_V; #endif } pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); #ifdef PMAP_PAE_COMP if ((cpu_feature & CPUID_PAT) == 0) { pmap_invalidate_cache_range( trunc_page((vm_offset_t)pmap->pm_pdpt), round_page((vm_offset_t)pmap->pm_pdpt + NPGPTD * sizeof(pdpt_entry_t))); } #endif - for (i = 0; i < NPGPTD; i++) - if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pdir + (i * NPDEPG)); - /* Install the trampoline mapping. */ pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); + m->pindex = ptepindex; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); KASSERT((pmap->pm_pdir[ptepindex] & PG_V) == 0, ("%s: page directory entry %#jx is valid", __func__, (uintmax_t)pmap->pm_pdir[ptepindex])); pmap->pm_pdir[ptepindex] = (pd_entry_t)(ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void __CONCAT(PMTYPE, release)(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = pmap->pm_ptdpg[i]; #ifdef PMAP_PAE_COMP KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * grow the number of kernel page table entries, if needed */ static void __CONCAT(PMTYPE, growkernel)(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } - nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - + nkpg->pindex = kernel_vm_end >> PDRSHIFT; nkpt++; - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; #ifdef PV_STATS extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; extern long pv_entry_frees, pv_entry_allocs; extern int pv_entry_spare; #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = __CONCAT(PMTYPE, pte)(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page_int(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire_noq(m); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entries tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ - if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if (pv_vafree == 0 || + (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags) { struct md_page *pvh; pv_entry_t pv; bool noreclaim; rw_assert(&pvh_global_lock, RA_WLOCKED); noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; if ((noreclaim && pv_entry_count >= pv_entry_high_water) || (pv = get_pv_entry(pmap, noreclaim)) == NULL) return (false); pv->pv_va = va; pvh = pa_to_pvh(pde & PG_PS_FRAME); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ - if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, - va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | - VM_ALLOC_WIRED)) == NULL) { + if ((oldpde & PG_A) == 0 || + (mpte = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } + mpte->pindex = va >> PDRSHIFT; if (pmap != kernel_pmap) { mpte->ref_count = NPTEPG; pmap->pm_stats.resident_count++; } } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (pmap == kernel_pmap) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pde: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page_int(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page_int(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free) { pt_entry_t *pte; bool anyvalid; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); anyvalid = false; for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated by * pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = true; if (pmap_remove_pte(pmap, pte, sva, free)) break; } return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ static void __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; if (pmap_remove_ptes(pmap, sva, pdnxt, &free)) anyvalid = 1; } out: sched_unpin(); if (anyvalid) pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void __CONCAT(PMTYPE, remove_all)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page_int(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ static void __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PMAP_PAE_COMP if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PMAP_PAE_COMP if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page_int(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ static int __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; va = trunc_page(va); KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); KASSERT(va < PMAP_TRM_MIN_ADDRESS, ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", va)); KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (pmap != kernel_pmap) newpte |= PG_U; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m); goto out; } pde = pmap_pde(pmap, va); if (pmap != kernel_pmap) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rv = KERN_RESOURCE_SHORTAGE; goto out; } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ mpte = NULL; KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } origpte = *pte; pv = NULL; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#x", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) { pmap_invalidate_page_int(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#x", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else { /* * Since this mapping is unmanaged, assume that PG_A * is set. */ pmap_invalidate_page_int(pmap, va); } origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap->pm_stats.resident_count++; } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#x", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } #ifdef PMAP_PAE_COMP else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } #endif if ((origpte & PG_A) != 0) pmap_invalidate_page_int(pmap, va); } else pte_store_zero(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); #endif rv = KERN_SUCCESS; out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns * true if successful. Returns false if (1) a mapping already exists at the * specified virtual address or (2) a PV entry cannot be allocated without * reclaiming another PV entry. */ static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t newpde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (pmap != kernel_pmap) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) == KERN_SUCCESS); } /* * Returns true if every page table entry in the page table page that maps * the specified kernel virtual address is zero. */ static bool pmap_every_pte_zero(vm_offset_t va) { pt_entry_t *pt_end, *pte; KASSERT((va & PDRMASK) == 0, ("va is misaligned")); pte = vtopte(va); for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { if (*pte != 0) return (false); } return (true); } /* * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS * if the mapping was created, and either KERN_FAILURE or * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the * specified virtual address. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m) { struct spglist free; pd_entry_t oldpde, *pde; vm_page_t mt; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_V) != 0) { if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (pmap != kernel_pmap || (oldpde & PG_PS) != 0 || !pmap_every_pte_zero(va))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * If the PDE resulted from a promotion, then a * reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { if (pmap_remove_ptes(pmap, va, va + NBPDR, &free)) pmap_invalidate_all_int(pmap); } if (pmap != kernel_pmap) { vm_page_free_pages_toq(&free, true); KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_pde: freed kernel page table page")); /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_4mpage(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ static void __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t newpte, *pte; KASSERT(pmap != kernel_pmap || !VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (pmap != kernel_pmap) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->ref_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } sched_pin(); pte = pmap_pte_quick(pmap, va); if (*pte) { if (mpte != NULL) mpte->ref_count--; sched_unpin(); return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); sched_unpin(); return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if (pmap != kernel_pmap) newpte |= PG_U; pte_store_zero(pte, newpte); sched_unpin(); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ static void * __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ static void __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pg_ps_enabled && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ static void __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. Since * current pmap is always the kernel pmap when executing in * kernel, and we do not copy from the kernel pmap to a user * pmap, this optimization is not usable in 4/4G full split i386 * world. */ static void __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { pt_entry_t *src_pte, *dst_pte, ptetemp; pd_entry_t srcptepaddr; vm_page_t dstmpte, srcmpte; vm_offset_t addr, end_addr, pdnxt; u_int ptepindex; if (dst_addr != src_addr) return; end_addr = src_addr + len; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { KASSERT(addr < PMAP_TRM_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy the trampoline")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = pmap_pte_quick3(src_pmap, addr); while (addr < pdnxt) { ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } if (dstmpte->ref_count >= srcmpte->ref_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ static void __CONCAT(PMTYPE, zero_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ static void __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ static void __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } static void __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ static boolean_t __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ static int __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ static boolean_t __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ static void __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = pmap_pte_quick(pmap, pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_modified)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ static boolean_t __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr) { pd_entry_t pde; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, addr); if (pde != 0 && (pde & PG_PS) == 0) rv = pmap_pte_ufast(pmap, addr, pde) == 0; PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_referenced)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void __CONCAT(PMTYPE, remove_write)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int __CONCAT(PMTYPE, ts_referenced)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ static void __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; bool anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = false; else { pv_lists_locked = true; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = false; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(pdnxt, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > pdnxt) va = pdnxt; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pte_quick(pmap, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, NULL); anychanged = true; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range_int(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range_int(pmap, va, sva); } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ static void __CONCAT(PMTYPE, clear_modify)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde(pmap, pde, va) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page_int(pmap, va); } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_page_t m; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) { va = pa + PMAP_MAP_LOW; if ((flags & MAPDEV_SETATTR) == 0) return ((void *)(va + offset)); } else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) { if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) { m = PHYS_TO_VM_PAGE(pa); if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) { pmap_kenter_attr(va + tmpsize, pa + tmpsize, m->md.pat_mode); continue; } } pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); } pmap_invalidate_range_int(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size); return ((void *)(va + offset)); } static void __CONCAT(PMTYPE, unmapdev)(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) { pmap_qremove(va, atop(size)); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ static void __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void __CONCAT(PMTYPE, flush_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ static int __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range_int(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva); } return (0); } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ static int __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pd_entry_t pde; pt_entry_t pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, addr); if (pde != 0) { if ((pde & PG_PS) != 0) { pte = pde; /* Compute the physical address of the 4KB page. */ pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_PSIND(1); } else { pte = pmap_pte_ufast(pmap, addr, pde); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } static void __CONCAT(PMTYPE, activate)(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #ifdef PMAP_PAE_COMP cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; PCPU_SET(curpmap, pmap); critical_exit(); } static void __CONCAT(PMTYPE, activate_boot)(pmap_t pmap) { u_int cpuid; cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ static void __CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } static vm_offset_t __CONCAT(PMTYPE, quick_enter_page)(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*pte)); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } static void __CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } static vmem_t *pmap_trm_arena; static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; static int trm_guard = PAGE_SIZE; static int pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, vmem_addr_t *addrp) { vm_page_t m; vmem_addr_t af, addr, prev_addr; pt_entry_t *trm_pte; prev_addr = atomic_load_long(&pmap_trm_arena_last); size = round_page(size) + trm_guard; for (;;) { if (prev_addr + size < prev_addr || prev_addr + size < size || prev_addr + size > PMAP_TRM_MAX_ADDRESS) return (ENOMEM); addr = prev_addr + size; if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) break; } prev_addr += trm_guard; trm_pte = PTmap + atop(prev_addr); for (af = prev_addr; af < addr; af += PAGE_SIZE) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | PG_M | PG_A | PG_RW | PG_V | pgeflag | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE)); } *addrp = prev_addr; return (0); } void pmap_init_trm(void) { vm_page_t pd_m; TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); if ((trm_guard & PAGE_MASK) != 0) trm_guard = 0; pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); - pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); - if ((pd_m->flags & PG_ZERO) == 0) - pmap_zero_page(pd_m); + pd_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK | + VM_ALLOC_ZERO); PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE); } static void * __CONCAT(PMTYPE, trm_alloc)(size_t size, int flags) { vmem_addr_t res; int error; MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); if (error != 0) return (NULL); if ((flags & M_ZERO) != 0) bzero((void *)res, size); return ((void *)res); } static void __CONCAT(PMTYPE, trm_free)(void *addr, size_t size) { vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); } static void __CONCAT(PMTYPE, ksetrw)(vm_offset_t va) { *vtopte(va) |= PG_RW; } static void __CONCAT(PMTYPE, remap_lowptdi)(bool enable) { PTD[KPTDI] = enable ? PTD[LOWPTDI] : 0; invltlb_glob(); } static vm_offset_t __CONCAT(PMTYPE, get_map_low)(void) { return (PMAP_MAP_LOW); } static vm_offset_t __CONCAT(PMTYPE, get_vm_maxuser_address)(void) { return (VM_MAXUSER_ADDRESS); } static vm_paddr_t __CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa) { return (pa & PG_FRAME); } static void __CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf) { pt_entry_t opte, *ptep; /* * Update the sf_buf's virtual-to-physical mapping, flushing the * virtual address from the TLB. Since the reference count for * the sf_buf's old mapping was zero, that mapping is not * currently in use. Consequently, there is no need to exchange * the old and new PTEs atomically, even under PAE. */ ptep = vtopte(sf->kva); opte = *ptep; *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0); /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor * that has invalidated the sf_buf's virtual address from its TLB * since the last used mapping need not invalidate again. */ #ifdef SMP if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) CPU_ZERO(&sf->cpumask); #else if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) pmap_invalidate_page_int(kernel_pmap, sf->kva); #endif } static void __CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma) { pt_entry_t *pte; int i; for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(ma[i]) | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(ma[i]), FALSE); invlpg(kaddr + ptoa(i)); } } static u_int __CONCAT(PMTYPE, get_kcr3)(void) { #ifdef PMAP_PAE_COMP return ((u_int)IdlePDPT); #else return ((u_int)IdlePTD); #endif } static u_int __CONCAT(PMTYPE, get_cr3)(pmap_t pmap) { #ifdef PMAP_PAE_COMP return ((u_int)vtophys(pmap->pm_pdpt)); #else return ((u_int)vtophys(pmap->pm_pdir)); #endif } static caddr_t __CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits) { pt_entry_t *pte; pte = CMAP3; *pte = pa | pte_bits; invltlb(); return (CADDR3); } static void __CONCAT(PMTYPE, basemem_setup)(u_int basemem) { pt_entry_t *pte; int i; /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } struct bios16_pmap_handle { pt_entry_t *pte; pd_entry_t *ptd; pt_entry_t orig_ptd; }; static void * __CONCAT(PMTYPE, bios16_enter)(void) { struct bios16_pmap_handle *h; /* * no page table, so create one and install it. */ h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK); h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); h->ptd = IdlePTD; *h->pte = vm86phystk | PG_RW | PG_V; h->orig_ptd = *h->ptd; *h->ptd = vtophys(h->pte) | PG_RW | PG_V; pmap_invalidate_all_int(kernel_pmap); /* XXX insurance for now */ return (h); } static void __CONCAT(PMTYPE, bios16_leave)(void *arg) { struct bios16_pmap_handle *h; h = arg; *h->ptd = h->orig_ptd; /* remove page table */ /* * XXX only needs to be invlpg(0) but that doesn't work on the 386 */ pmap_invalidate_all_int(kernel_pmap); free(h->pte, M_TEMP); /* ... and free it */ } struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_UNCACHED: mode = "U-"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: printf("%s: unknown PAT mode %#x for range 0x%08x-0x%08x\n", __func__, pat_idx, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "0x%08x-0x%08x r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & PG_U) != 0 ? 'u' : 's', (range->attrs & PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = 0xffffffff; } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = pg_nx | PG_G | PG_RW | PG_U | PG_PDE_CACHE; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pde & (PG_RW | PG_U | pg_nx); if ((pde & PG_PS) != 0) { attrs |= pde & (PG_G | PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (PG_RW | PG_U)); attrs |= pte & (PG_G | PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. */ if ((attrs & PG_PTE_PAT) != 0) attrs ^= PG_PDE_PAT | PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int __CONCAT(PMTYPE, sysctl_kmaps)(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pd_entry_t pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error; u_int i, k; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = 0xffffffff; /* * Iterate over the kernel page tables without holding the * kernel pmap lock. Kernel page table pages are never freed, * so at worst we will observe inconsistencies in the output. */ for (sva = 0, i = 0; i < NPTEPG * NPGPTD * NPDEPG ;) { if (i == 0) sbuf_printf(sb, "\nLow PDE:\n"); else if (i == LOWPTDI * NPTEPG) sbuf_printf(sb, "Low PDE dup:\n"); else if (i == PTDPTDI * NPTEPG) sbuf_printf(sb, "Recursive map:\n"); else if (i == KERNPTDI * NPTEPG) sbuf_printf(sb, "Kernel base:\n"); else if (i == TRPTDI * NPTEPG) sbuf_printf(sb, "Trampoline:\n"); pde = IdlePTD[sva >> PDRSHIFT]; if ((pde & PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; i += NPTEPG; continue; } pa = pde & PG_FRAME; if ((pde & PG_PS) != 0) { sysctl_kmaps_check(sb, &range, sva, pde, 0); range.pdes++; sva += NBPDR; i += NPTEPG; continue; } for (pt = vtopte(sva), k = 0; k < NPTEPG; i++, k++, pt++, sva += PAGE_SIZE) { pte = *pt; if ((pte & PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pde, pte); range.ptes++; } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } #define PMM(a) \ .pm_##a = __CONCAT(PMTYPE, a), struct pmap_methods __CONCAT(PMTYPE, methods) = { PMM(ksetrw) PMM(remap_lower) PMM(remap_lowptdi) PMM(align_superpage) PMM(quick_enter_page) PMM(quick_remove_page) PMM(trm_alloc) PMM(trm_free) PMM(get_map_low) PMM(get_vm_maxuser_address) PMM(kextract) PMM(pg_frame) PMM(sf_buf_map) PMM(cp_slow0_map) PMM(get_kcr3) PMM(get_cr3) PMM(cmap3) PMM(basemem_setup) PMM(set_nx) PMM(bios16_enter) PMM(bios16_leave) PMM(bootstrap) PMM(is_valid_memattr) PMM(cache_bits) PMM(ps_enabled) PMM(pinit0) PMM(pinit) PMM(activate) PMM(activate_boot) PMM(advise) PMM(clear_modify) PMM(change_attr) PMM(mincore) PMM(copy) PMM(copy_page) PMM(copy_pages) PMM(zero_page) PMM(zero_page_area) PMM(enter) PMM(enter_object) PMM(enter_quick) PMM(kenter_temporary) PMM(object_init_pt) PMM(unwire) PMM(page_exists_quick) PMM(page_wired_mappings) PMM(page_is_mapped) PMM(remove_pages) PMM(is_modified) PMM(is_prefaultable) PMM(is_referenced) PMM(remove_write) PMM(ts_referenced) PMM(mapdev_attr) PMM(unmapdev) PMM(page_set_memattr) PMM(extract) PMM(extract_and_hold) PMM(map) PMM(qenter) PMM(qremove) PMM(release) PMM(remove) PMM(protect) PMM(remove_all) PMM(init) PMM(init_pat) PMM(growkernel) PMM(invalidate_page) PMM(invalidate_range) PMM(invalidate_all) PMM(invalidate_cache) PMM(flush_page) PMM(kenter) PMM(kremove) PMM(sysctl_kmaps) }; diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index 36316f2bc3ff..d1f2fd2bd9e4 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -1,1709 +1,1709 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Primary Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Primary Zone, * thus sharing backend Slab kegs with the Mbuf Primary Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Primary Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ bool mb_use_ext_pgs = true; /* use M_EXTPG mbufs for sendfile & TLS */ SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, &mb_use_ext_pgs, 0, "Use unmapped mbufs for sendfile(2) and TLS offload"); static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); static counter_u64_t snd_tag_count; SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, &snd_tag_count, "# of active mbuf send tags"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); _Static_assert(sizeof(struct mbuf) <= MSIZE, "size of mbuf exceeds MSIZE"); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); #ifdef DEBUGNET /* * debugnet makes use of a pre-allocated pool of mbufs and clusters. When * debugnet is configured, we initialize a set of UMA cache zones which return * items from this pool. At panic-time, the regular UMA zone pointers are * overwritten with those of the cache zones so that drivers may allocate and * free mbufs and clusters without attempting to allocate physical memory. * * We keep mbufs and clusters in a pair of mbuf queues. In particular, for * the purpose of caching clusters, we treat them as mbufs. */ static struct mbufq dn_mbufq = { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; static struct mbufq dn_clustq = { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; static int dn_clsize; static uma_zone_t dn_zone_mbuf; static uma_zone_t dn_zone_clust; static uma_zone_t dn_zone_pack; static struct debugnet_saved_zones { uma_zone_t dsz_mbuf; uma_zone_t dsz_clust; uma_zone_t dsz_pack; uma_zone_t dsz_jumbop; uma_zone_t dsz_jumbo9; uma_zone_t dsz_jumbo16; bool dsz_debugnet_zones_enabled; } dn_saved_zones; static int dn_buf_import(void *arg, void **store, int count, int domain __unused, int flags) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = mbufq_dequeue(q); if (m == NULL) break; trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_buf_release(void *arg, void **store, int count) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = store[i]; (void)mbufq_enqueue(q, m); } } static int dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, int flags __unused) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = m_get(MT_DATA, M_NOWAIT); if (m == NULL) break; clust = uma_zalloc(dn_zone_clust, M_NOWAIT); if (clust == NULL) { m_free(m); break; } mb_ctor_clust(clust, dn_clsize, m, 0); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_pack_release(void *arg __unused, void **store, int count) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = store[i]; clust = m->m_ext.ext_buf; uma_zfree(dn_zone_clust, clust); uma_zfree(dn_zone_mbuf, m); } } /* * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy * the corresponding UMA cache zones. */ void debugnet_mbuf_drain(void) { struct mbuf *m; void *item; if (dn_zone_mbuf != NULL) { uma_zdestroy(dn_zone_mbuf); dn_zone_mbuf = NULL; } if (dn_zone_clust != NULL) { uma_zdestroy(dn_zone_clust); dn_zone_clust = NULL; } if (dn_zone_pack != NULL) { uma_zdestroy(dn_zone_pack); dn_zone_pack = NULL; } while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) m_free(m); while ((item = mbufq_dequeue(&dn_clustq)) != NULL) uma_zfree(m_getzone(dn_clsize), item); } /* * Callback invoked immediately prior to starting a debugnet connection. */ void debugnet_mbuf_start(void) { MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); /* Save the old zone pointers to restore when debugnet is closed. */ dn_saved_zones = (struct debugnet_saved_zones) { .dsz_debugnet_zones_enabled = true, .dsz_mbuf = zone_mbuf, .dsz_clust = zone_clust, .dsz_pack = zone_pack, .dsz_jumbop = zone_jumbop, .dsz_jumbo9 = zone_jumbo9, .dsz_jumbo16 = zone_jumbo16, }; /* * All cluster zones return buffers of the size requested by the * drivers. It's up to the driver to reinitialize the zones if the * MTU of a debugnet-enabled interface changes. */ printf("debugnet: overwriting mbuf zone pointers\n"); zone_mbuf = dn_zone_mbuf; zone_clust = dn_zone_clust; zone_pack = dn_zone_pack; zone_jumbop = dn_zone_clust; zone_jumbo9 = dn_zone_clust; zone_jumbo16 = dn_zone_clust; } /* * Callback invoked when a debugnet connection is closed/finished. */ void debugnet_mbuf_finish(void) { MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); printf("debugnet: restoring mbuf zone pointers\n"); zone_mbuf = dn_saved_zones.dsz_mbuf; zone_clust = dn_saved_zones.dsz_clust; zone_pack = dn_saved_zones.dsz_pack; zone_jumbop = dn_saved_zones.dsz_jumbop; zone_jumbo9 = dn_saved_zones.dsz_jumbo9; zone_jumbo16 = dn_saved_zones.dsz_jumbo16; memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); } /* * Reinitialize the debugnet mbuf+cluster pool and cache zones. */ void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) { struct mbuf *m; void *item; debugnet_mbuf_drain(); dn_clsize = clsize; dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, dn_buf_import, dn_buf_release, &dn_mbufq, UMA_ZONE_NOBUCKET); dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, clsize, mb_ctor_clust, NULL, NULL, NULL, dn_buf_import, dn_buf_release, &dn_clustq, UMA_ZONE_NOBUCKET); dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, dn_pack_import, dn_pack_release, NULL, UMA_ZONE_NOBUCKET); while (nmbuf-- > 0) { m = m_get(MT_DATA, M_WAITOK); uma_zfree(dn_zone_mbuf, m); } while (nclust-- > 0) { item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); uma_zfree(dn_zone_clust, item); } } #endif /* DEBUGNET */ /* * Constructor for Mbuf primary zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf primary zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__)); if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #if defined(INVARIANTS) && !defined(KMSAN) trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, cause them to be woken up by draining the * packet zone. We are exposed to a race here (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted(zone_clust)) uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #if defined(INVARIANTS) && !defined(KMSAN) trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #if defined(INVARIANTS) && !defined(KMSAN) trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #if defined(INVARIANTS) && !defined(KMSAN) trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #if defined(INVARIANTS) && !defined(KMSAN) trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is closed to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct epoch_tracker et; struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); NET_EPOCH_ENTER(et); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); NET_EPOCH_EXIT(et); } /* * Free "count" units of I/O from an mbuf chain. They could be held * in M_EXTPG or just as a normal mbuf. This code is intended to be * called in an error path (I/O error, closed connection, etc). */ void mb_free_notready(struct mbuf *m, int count) { int i; for (i = 0; i < count && m != NULL; i++) { if ((m->m_flags & M_EXTPG) != 0) { m->m_epg_nrdy--; if (m->m_epg_nrdy != 0) continue; } m = m_free(m); } KASSERT(i == count, ("Removed only %d items from %p", i, m)); } /* * Compress an unmapped mbuf into a simple mbuf when it holds a small * amount of data. This is used as a DOS defense to avoid having * small packets tie up wired pages, an ext_pgs structure, and an * mbuf. Since this converts the existing mbuf in place, it can only * be used if there are no other references to 'm'. */ int mb_unmapped_compress(struct mbuf *m) { volatile u_int *refcnt; char buf[MLEN]; /* * Assert that 'm' does not have a packet header. If 'm' had * a packet header, it would only be able to hold MHLEN bytes * and m_data would have to be initialized differently. */ KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG), ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m)); KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt != 1) return (EBUSY); m_copydata(m, 0, m->m_len, buf); /* Free the backing pages. */ m->m_ext.ext_free(m); /* Turn 'm' into a "normal" mbuf. */ m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG); m->m_data = m->m_dat; /* Copy data back into m. */ bcopy(buf, mtod(m, char *), m->m_len); return (0); } /* * These next few routines are used to permit downgrading an unmapped * mbuf to a chain of mapped mbufs. This is used when an interface * doesn't supported unmapped mbufs or if checksums need to be * computed in software. * * Each unmapped mbuf is converted to a chain of mbufs. First, any * TLS header data is stored in a regular mbuf. Second, each page of * unmapped data is stored in an mbuf with an EXT_SFBUF external * cluster. These mbufs use an sf_buf to provide a valid KVA for the * associated physical page. They also hold a reference on the * original M_EXTPG mbuf to ensure the physical page doesn't go away. * Finally, any TLS trailer data is stored in a regular mbuf. * * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF * mbufs. It frees the associated sf_buf and releases its reference * on the original M_EXTPG mbuf. * * _mb_unmapped_to_ext() is a helper function that converts a single * unmapped mbuf into a chain of mbufs. * * mb_unmapped_to_ext() is the public function that walks an mbuf * chain converting any unmapped mbufs to mapped mbufs. It returns * the new chain of unmapped mbufs on success. On failure it frees * the original mbuf chain and returns NULL. */ static void mb_unmapped_free_mext(struct mbuf *m) { struct sf_buf *sf; struct mbuf *old_m; sf = m->m_ext.ext_arg1; sf_buf_free(sf); /* Drop the reference on the backing M_EXTPG mbuf. */ old_m = m->m_ext.ext_arg2; mb_free_extpg(old_m); } static struct mbuf * _mb_unmapped_to_ext(struct mbuf *m) { struct mbuf *m_new, *top, *prev, *mref; struct sf_buf *sf; vm_page_t pg; int i, len, off, pglen, pgoff, seglen, segoff; volatile u_int *refcnt; u_int ref_inc = 0; M_ASSERTEXTPG(m); len = m->m_len; KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* Skip over any data removed from the front. */ off = mtod(m, vm_offset_t); top = NULL; if (m->m_epg_hdrlen != 0) { if (off >= m->m_epg_hdrlen) { off -= m->m_epg_hdrlen; } else { seglen = m->m_epg_hdrlen - off; segoff = off; seglen = min(seglen, len); off = 0; len -= seglen; m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; m_new->m_len = seglen; prev = top = m_new; memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff], seglen); } } pgoff = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off >= pglen) { off -= pglen; pgoff = 0; continue; } seglen = pglen - off; segoff = pgoff + off; off = 0; seglen = min(seglen, len); len -= seglen; pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) { top = prev = m_new; } else { prev->m_next = m_new; prev = m_new; } sf = sf_buf_alloc(pg, SFB_NOWAIT); if (sf == NULL) goto fail; ref_inc++; m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); m_new->m_data += segoff; m_new->m_len = seglen; pgoff = 0; }; if (len != 0) { KASSERT((off + len) <= m->m_epg_trllen, ("off + len > trail (%d + %d > %d)", off, len, m->m_epg_trllen)); m_new = m_get(M_NOWAIT, MT_DATA); if (m_new == NULL) goto fail; if (top == NULL) top = m_new; else prev->m_next = m_new; m_new->m_len = len; memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len); } if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be dropped * in mb_unmapped_free_mext(). */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); return (top); fail: if (ref_inc != 0) { /* * Obtain an additional reference on the old mbuf for * each created EXT_SFBUF mbuf. They will be * immediately dropped when these mbufs are freed * below. */ if (*refcnt == 1) *refcnt += ref_inc; else atomic_add_int(refcnt, ref_inc); } m_free(m); m_freem(top); return (NULL); } struct mbuf * mb_unmapped_to_ext(struct mbuf *top) { struct mbuf *m, *next, *prev = NULL; prev = NULL; for (m = top; m != NULL; m = next) { /* m might be freed, so cache the next pointer. */ next = m->m_next; if (m->m_flags & M_EXTPG) { if (prev != NULL) { /* * Remove 'm' from the new chain so * that the 'top' chain terminates * before 'm' in case 'top' is freed * due to an error. */ prev->m_next = NULL; } m = _mb_unmapped_to_ext(m); if (m == NULL) { m_freem(top); m_freem(next); return (NULL); } if (prev == NULL) { top = m; } else { prev->m_next = m; } /* * Replaced one mbuf with a chain, so we must * find the end of chain. */ prev = m_last(m); } else { if (prev != NULL) { prev->m_next = m; } prev = m; } } return (top); } /* * Allocate an empty M_EXTPG mbuf. The ext_free routine is * responsible for freeing any pages backing this mbuf when it is * freed. */ struct mbuf * mb_alloc_ext_pgs(int how, m_ext_free_t ext_free) { struct mbuf *m; m = m_get(how, MT_DATA); if (m == NULL) return (NULL); m->m_epg_npgs = 0; m->m_epg_nrdy = 0; m->m_epg_1st_off = 0; m->m_epg_last_len = 0; m->m_epg_flags = 0; m->m_epg_hdrlen = 0; m->m_epg_trllen = 0; m->m_epg_tls = NULL; m->m_epg_so = NULL; m->m_data = NULL; m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_ext.ext_size = 0; m->m_ext.ext_free = ext_free; return (m); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF || m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); m_free_raw(mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); m_free_raw(mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); m_free_raw(mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); m_free_raw(mref); break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); m_free_raw(mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; case EXT_RXRING: KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free is set", __func__)); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) m_free_raw(m); } /* * Clean up after mbufs with M_EXTPG storage attached to them if the * reference count hits 1. */ void mb_free_extpg(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; M_ASSERTEXTPG(m); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); #ifdef KERN_TLS if (mref->m_epg_tls != NULL && !refcount_release_if_not_last(&mref->m_epg_tls->refcount)) ktls_enqueue_to_free(mref); else #endif m_free_raw(mref); } if (m != mref) m_free_raw(m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_get3() allocates minimum mbuf that would fit "size" argument. * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. */ struct mbuf * m_get3(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size <= MJUMPAGESIZE) return (m_get2(size, how, type, flags)); if (size > MJUM16BYTES) return (NULL); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); if (size <= MJUM9BYTES) zone = zone_jumbo9; else zone = zone_jumbo16; n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } MBUF_PROBE5(m__getjcl, how, type, flags, size, m); return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { mb = NULL; if (len > MCLBYTES) { mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), MJUMPAGESIZE); } if (mb == NULL) { if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* * Fail the whole operation if one mbuf can't be * allocated. */ if (mb == NULL) { m_freem(nm); return (NULL); } } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. */ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } /* * Temporary primitive to allow freeing without going through m_free. */ void m_free_raw(struct mbuf *mb) { uma_zfree(zone_mbuf, mb); } int m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **mstp) { if (ifp->if_snd_tag_alloc == NULL) return (EOPNOTSUPP); return (ifp->if_snd_tag_alloc(ifp, params, mstp)); } void m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, const struct if_snd_tag_sw *sw) { if_ref(ifp); mst->ifp = ifp; refcount_init(&mst->refcount, 1); mst->sw = sw; counter_u64_add(snd_tag_count, 1); } void m_snd_tag_destroy(struct m_snd_tag *mst) { struct ifnet *ifp; ifp = mst->ifp; mst->sw->snd_tag_free(mst); if_rele(ifp); counter_u64_add(snd_tag_count, -1); } /* * Allocate an mbuf with anonymous external pages. */ struct mbuf * mb_alloc_ext_plus_pages(int len, int how) { struct mbuf *m; vm_page_t pg; int i, npgs; m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (m == NULL) return (NULL); m->m_epg_flags |= EPG_FLAG_ANON; npgs = howmany(len, PAGE_SIZE); for (i = 0; i < npgs; i++) { do { - pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); + pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | + VM_ALLOC_WIRED); if (pg == NULL) { if (how == M_NOWAIT) { m->m_epg_npgs = i; m_free(m); return (NULL); } vm_wait(NULL); } } while (pg == NULL); m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); } m->m_epg_npgs = npgs; return (m); } /* * Copy the data in the mbuf chain to a chain of mbufs with anonymous external * unmapped pages. * len is the length of data in the input mbuf chain. * mlen is the maximum number of bytes put into each ext_page mbuf. */ struct mbuf * mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, struct mbuf **mlast) { struct mbuf *m, *mout; char *pgpos, *mbpos; int i, mblen, mbufsiz, pglen, xfer; if (len == 0) return (NULL); mbufsiz = min(mlen, len); m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); if (m == NULL) return (m); pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); pglen = PAGE_SIZE; mblen = 0; i = 0; do { if (pglen == 0) { if (++i == m->m_epg_npgs) { m->m_epg_last_len = PAGE_SIZE; mbufsiz = min(mlen, len); m->m_next = mb_alloc_ext_plus_pages(mbufsiz, how); m = m->m_next; if (m == NULL) { m_freem(mout); return (m); } i = 0; } pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); pglen = PAGE_SIZE; } while (mblen == 0) { if (mp == NULL) { m_freem(mout); return (NULL); } KASSERT((mp->m_flags & M_EXTPG) == 0, ("mb_copym_ext_pgs: ext_pgs input mbuf")); mbpos = mtod(mp, char *); mblen = mp->m_len; mp = mp->m_next; } xfer = min(mblen, pglen); memcpy(pgpos, mbpos, xfer); pgpos += xfer; mbpos += xfer; pglen -= xfer; mblen -= xfer; len -= xfer; m->m_len += xfer; } while (len > 0); m->m_epg_last_len = PAGE_SIZE - pglen; if (mlast != NULL) *mlast = m; return (mout); } diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index bde9fca97f50..f6179592fb11 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1,2723 +1,2719 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #include #include #ifdef RSS #include #include #endif #include #include #if defined(INET) || defined(INET6) #include #include #endif #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf) m_head; STAILQ_HEAD(, socket) so_head; bool running; int lastallocfail; } __aligned(CACHE_LINE_SIZE); struct ktls_alloc_thread { uint64_t wakeups; uint64_t allocs; struct thread *td; int running; }; struct ktls_domain_info { int count; int cpu[MAXCPU]; struct ktls_alloc_thread alloc_td; }; struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; static uma_zone_t ktls_session_zone; static uma_zone_t ktls_buffer_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; static int ktls_init_state; static struct sx ktls_init_lock; SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init"); SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload stats"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores (1) or cores and domains (2) at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); unsigned int ktls_ifnet_max_rexmit_pct = 2; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN, &ktls_ifnet_max_rexmit_pct, 2, "Max percent bytes retransmitted before ifnet TLS is disabled"); static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static bool ktls_sw_buffer_cache = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN, &ktls_sw_buffer_cache, 1, "Enable caching of output buffers for SW encryption"); static int ktls_max_alloc = 128; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN, &ktls_max_alloc, 128, "Max number of 16k buffers to allocate in thread context"); static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active); SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD, &ktls_cnt_tx_pending, "Number of TLS 1.0 records waiting for earlier TLS records"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, &ktls_cnt_tx_queued, "Number of TLS records in queue to tasks for SW encryption"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, &ktls_cnt_rx_queued, "Number of TLS sockets in queue to tasks for SW decryption"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_total); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_active); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, &ktls_offload_corrupted_records, "Total corrupted TLS records received"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD, &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD, &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Hardware (ifnet) TLS session stats"); #ifdef TCP_OFFLOAD SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TOE TLS session stats"); #endif static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_sw_chacha20, "Active number of software TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_ifnet_chacha20, "Active number of ifnet TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); #ifdef TCP_OFFLOAD static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD, &ktls_toe_cbc, "Active number of TOE TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD, &ktls_toe_gcm, "Active number of TOE TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_toe_chacha20, "Active number of TOE TLS sessions using Chacha20-Poly1305"); #endif static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); static void ktls_alloc_thread(void *ctx); #if defined(INET) || defined(INET6) static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; #ifdef NUMA struct ktls_domain_info *di; #endif u_int cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that TLS 1.0 sessions rely on the * serialization provided by having the same connection use * the same queue. */ #ifdef NUMA if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { di = &ktls_domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } else #endif cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static int ktls_buffer_import(void *arg, void **store, int count, int domain, int flags) { vm_page_t m; int i; KASSERT((ktls_maxlen & PAGE_MASK) == 0, ("%s: ktls max length %d is not page size-aligned", __func__, ktls_maxlen)); for (i = 0; i < count; i++) { m = vm_page_alloc_contig_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags), atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (m == NULL) break; store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (i); } static void ktls_buffer_release(void *arg __unused, void **store, int count) { vm_page_t m; int i, j; for (i = 0; i < count; i++) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (j = 0; j < atop(ktls_maxlen); j++) { (void)vm_page_unwire_noq(m + j); vm_page_free(m + j); } } } static void ktls_free_mext_contig(struct mbuf *m) { M_ASSERTEXTPG(m); uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0])); } static int ktls_init(void) { struct thread *td; struct pcpu *pc; int count, domain, error, i; ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); if (ktls_sw_buffer_cache) { ktls_buffer_zone = uma_zcache_create("ktls_buffers", roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL, ktls_buffer_import, ktls_buffer_release, NULL, UMA_ZONE_FIRSTTOUCH); } /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].m_head); STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); if (ktls_bind_threads > 1) { pc = pcpu_find(i); domain = pc->pc_domain; count = ktls_domains[domain].count; ktls_domains[domain].cpu[count] = i; ktls_domains[domain].count++; } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } /* * If we somehow have an empty domain, fall back to choosing * among all KTLS threads. */ if (ktls_bind_threads > 1) { for (i = 0; i < vm_ndomains; i++) { if (ktls_domains[i].count == 0) { ktls_bind_threads = 1; break; } } } /* Start kthreads for each workqueue. */ CPU_FOREACH(i) { error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) { printf("Can't add KTLS thread %d error %d\n", i, error); return (error); } } /* * Start an allocation thread per-domain to perform blocking allocations * of 16k physically contiguous TLS crypto destination buffers. */ if (ktls_sw_buffer_cache) { for (domain = 0; domain < vm_ndomains; domain++) { if (VM_DOMAIN_EMPTY(domain)) continue; if (CPU_EMPTY(&cpuset_domain[domain])) continue; error = kproc_kthread_add(ktls_alloc_thread, &ktls_domains[domain], &ktls_proc, &ktls_domains[domain].alloc_td.td, 0, 0, "KTLS", "alloc_%d", domain); if (error) { printf("Can't add KTLS alloc thread %d error %d\n", domain, error); return (error); } } } if (bootverbose) printf("KTLS: Initialized %d threads\n", ktls_number_threads); return (0); } static int ktls_start_kthreads(void) { int error, state; start: state = atomic_load_acq_int(&ktls_init_state); if (__predict_true(state > 0)) return (0); if (state < 0) return (ENXIO); sx_xlock(&ktls_init_lock); if (ktls_init_state != 0) { sx_xunlock(&ktls_init_lock); goto start; } error = ktls_init(); if (error == 0) state = 1; else state = -1; atomic_store_rel_int(&ktls_init_state, state); sx_xunlock(&ktls_init_lock); return (error); } #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.3 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv)) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: #ifdef COMPAT_FREEBSD12 /* XXX: Really 13.0-current COMPAT. */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: #endif break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); if ((en->tls_vminor == TLS_MINOR_VER_TWO && en->iv_len != TLS_AEAD_GCM_LEN) || (en->tls_vminor == TLS_MINOR_VER_THREE && en->iv_len != TLS_1_3_GCM_IV_LEN)) return (EINVAL); break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: /* * TLS 1.0 requires an implicit IV. TLS 1.1+ * all use explicit IVs. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) { if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; } /* FALLTHROUGH */ case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); if (en->tls_vminor != TLS_MINOR_VER_ZERO && en->tls_vminor != TLS_MINOR_VER_ONE && en->tls_vminor != TLS_MINOR_VER_TWO) return (EINVAL); break; case CRYPTO_CHACHA20_POLY1305: if (en->auth_algorithm != 0 || en->auth_key_len != 0) return (EINVAL); if (en->tls_vminor != TLS_MINOR_VER_TWO && en->tls_vminor != TLS_MINOR_VER_THREE) return (EINVAL); if (en->iv_len != TLS_CHACHA20_IV_LEN) return (EINVAL); break; default: return (EINVAL); } error = ktls_start_kthreads(); if (error != 0) return (error); tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte * nonce. TLS 1.3 uses a 12 byte implicit IV. */ if (en->tls_vminor < TLS_MINOR_VER_THREE) tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. */ tls->sequential_records = true; tls->next_seqno = be64dec(en->rec_seq); STAILQ_INIT(&tls->pending_records); } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; case CRYPTO_CHACHA20_POLY1305: /* * Chacha20 uses a 12 byte implicit IV. */ tls->params.tls_tlen = POLY1305_HASH_LEN; tls->params.tls_bs = 1; break; default: panic("invalid cipher"); } /* * TLS 1.3 includes optional padding which we do not support, * and also puts the "real" record type at the end of the * encrypted data. */ if (en->tls_vminor == TLS_MINOR_VER_THREE) tls->params.tls_tlen += sizeof(uint8_t); KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for AEAD * ciphers and the initial implicit IV for TLS 1.0. The * explicit portions of the IV are generated in ktls_frame(). */ if (en->iv_len != 0) { tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; /* * For TLS 1.2 with GCM, generate an 8-byte nonce as a * counter to generate unique explicit IVs. * * Store this counter in the last 8 bytes of the IV * array so that it is 8-byte aligned. */ if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && en->tls_vminor == TLS_MINOR_VER_TWO) arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0); } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag, tls_new); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. */ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); switch (tls->mode) { case TCP_TLS_MODE_SW: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, -1); break; } ktls_ocf_free(tls); break; case TCP_TLS_MODE_IFNET: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, -1); break; } if (tls->snd_tag != NULL) m_snd_tag_rele(tls->snd_tag); break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, -1); break; } break; #endif } if (tls->params.auth_key != NULL) { zfree(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { zfree(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) #ifdef TCP_OFFLOAD static int ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) { struct inpcb *inp; struct tcpcb *tp; int error; inp = so->so_pcb; INP_WLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); if (!(tp->t_flags & TF_TOE)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = tcp_offload_alloc_tls_session(tp, tls, direction); INP_WUNLOCK(inp); if (error == 0) { tls->mode = TCP_TLS_MODE_TOE; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, 1); break; } } return (error); } #endif /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); /* * Allocate a TLS + ratelimit tag if the connection has an * existing pacing rate. */ if (tp->t_pacing_rate != -1 && (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) { params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT; params.tls_rate_limit.inp = inp; params.tls_rate_limit.tls = tls; params.tls_rate_limit.max_rate = tp->t_pacing_rate; } else { params.hdr.type = IF_SND_TAG_TYPE_TLS; params.tls.inp = inp; params.tls.tls = tls; } params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; INP_RUNLOCK(inp); if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = m_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->mode = TCP_TLS_MODE_IFNET; tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, 1); break; } } return (error); } static int ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction) { int error; error = ktls_ocf_try(so, tls, direction); if (error) return (error); tls->mode = TCP_TLS_MODE_SW; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, 1); break; } return (0); } /* * KTLS RX stores data in the socket buffer as a list of TLS records, * where each record is stored as a control message containg the TLS * header followed by data mbufs containing the decrypted data. This * is different from KTLS TX which always uses an mb_ext_pgs mbuf for * both encrypted and decrypted data. TLS records decrypted by a NIC * should be queued to the socket buffer as records, but encrypted * data which needs to be decrypted by software arrives as a stream of * regular mbufs which need to be converted. In addition, there may * already be pending encrypted data in the socket buffer when KTLS RX * is enabled. * * To manage not-yet-decrypted data for KTLS RX, the following scheme * is used: * * - A single chain of NOTREADY mbufs is hung off of sb_mtls. * * - ktls_check_rx checks this chain of mbufs reading the TLS header * from the first mbuf. Once all of the data for that TLS record is * queued, the socket is queued to a worker thread. * * - The worker thread calls ktls_decrypt to decrypt TLS records in * the TLS chain. Each TLS record is detached from the TLS chain, * decrypted, and inserted into the regular socket buffer chain as * record starting with a control message holding the TLS header and * a chain of mbufs holding the encrypted data. */ static void sb_mark_notready(struct sockbuf *sb) { struct mbuf *m; m = sb->sb_mb; sb->sb_mtls = m; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); m->m_flags |= M_NOTREADY; sb->sb_acc -= m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mtlstail = m; } KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, sb->sb_ccc)); } int ktls_enable_rx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_rcv.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS 1.3 is not yet supported. */ if (en->tls_vmajor == TLS_MAJOR_VER_ONE && en->tls_vminor == TLS_MINOR_VER_THREE) return (ENOTSUP); error = ktls_create_session(so, en, &tls); if (error) return (error); #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); if (error) #endif error = ktls_try_sw(so, tls, KTLS_RX); if (error) { ktls_cleanup(tls); return (error); } /* Mark the socket as using TLS offload. */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; so->so_rcv.sb_flags |= SB_TLS_RX; /* Mark existing data as not ready until it can be decrypted. */ if (tls->mode != TCP_TLS_MODE_TOE) { sb_mark_notready(&so->so_rcv); ktls_check_rx(&so->so_rcv); } SOCKBUF_UNLOCK(&so->so_rcv); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_TX); if (error) #endif error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls, KTLS_TX); if (error) { ktls_cleanup(tls); return (error); } error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } /* * Write lock the INP when setting sb_tls_info so that * routines in tcp_ratelimit.c can read sb_tls_info while * holding the INP lock. */ inp = so->so_pcb; INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_rx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_RECVBUF_LOCK(so); tls = so->so_rcv.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_RECVBUF_UNLOCK(so); return (0); } int ktls_get_tx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_SENDBUF_LOCK(so); tls = so->so_snd.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_SENDBUF_UNLOCK(so); return (0); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; if (SOLISTENING(so)) return (EINVAL); switch (mode) { case TCP_TLS_MODE_SW: case TCP_TLS_MODE_IFNET: break; default: return (EINVAL); } inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if (tls->mode == mode) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); SOCK_IO_SEND_UNLOCK(so); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); SOCK_IO_SEND_UNLOCK(so); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. * * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(tp->t_vnet); tp = tcp_drop(tp, ECONNABORTED); CURVNET_RESTORE(); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; /* Can't get to the inp, but it should be locked. */ /* INP_LOCK_ASSERT(inp); */ MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag == NULL) { /* * Resetting send tag, ignore this change. The * pending reset may or may not see this updated rate * in the tcpcb. If it doesn't, we will just lose * this rate change. */ return (0); } MPASS(tls->snd_tag != NULL); MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT); mst = tls->snd_tag; return (mst->sw->snd_tag_modify(mst, ¶ms)); } #endif #endif void ktls_destroy(struct ktls_session *tls) { if (tls->sequential_records) { struct mbuf *m, *n; int page_count; STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) { page_count = m->m_epg_enc_cnt; while (page_count > 0) { KASSERT(page_count >= m->m_epg_nrdy, ("%s: too few pages", __func__)); page_count -= m->m_epg_nrdy; m = m_free(m); } } } ktls_cleanup(tls); uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_seq: mapped mbuf %p", m)); m->m_epg_seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. To handle the * special case of empty fragments for TLS 1.0 sessions, an empty * fragment counts as one page. */ void ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; uint64_t *noncep; uint16_t tls_len; int maxlen; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be TLS records whose * payload does not exceed the maximum frame length. * * Empty TLS records are permitted when using CBC. */ KASSERT(m->m_len <= maxlen && (tls->params.cipher_algorithm == CRYPTO_AES_CBC ? m->m_len >= 0 : m->m_len > 0), ("ktls_frame: m %p len %d\n", m, m->m_len)); /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); tls_len = m->m_len; /* Save a reference to the session. */ m->m_epg_tls = ktls_hold(tls); m->m_epg_hdrlen = tls->params.tls_hlen; m->m_epg_trllen = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); m->m_epg_trllen -= delta; } m->m_len += m->m_epg_hdrlen + m->m_epg_trllen; /* Populate the TLS header. */ tlshdr = (void *)m->m_epg_hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; /* * TLS 1.3 masquarades as TLS 1.2 with a record type * of TLS_RLTYPE_APP. */ if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { tlshdr->tls_vminor = TLS_MINOR_VER_TWO; tlshdr->tls_type = TLS_RLTYPE_APP; /* save the real record type for later */ m->m_epg_record_type = record_type; m->m_epg_trail[0] = record_type; } else { tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * Store nonces / explicit IVs after the end of the * TLS header. * * For GCM with TLS 1.2, an 8 byte nonce is copied * from the end of the IV. The nonce is then * incremented for use by the next record. * * For CBC, a random nonce is inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && tls->params.tls_vminor == TLS_MINOR_VER_TWO) { noncep = (uint64_t *)(tls->params.iv + 8); be64enc(tlshdr + 1, *noncep); (*noncep)++; } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->mode == TCP_TLS_MODE_SW) { m->m_flags |= M_NOTREADY; if (__predict_false(tls_len == 0)) { /* TLS 1.0 empty fragment. */ m->m_epg_nrdy = 1; } else m->m_epg_nrdy = m->m_epg_npgs; *enq_cnt += m->m_epg_nrdy; } } } void ktls_check_rx(struct sockbuf *sb) { struct tls_record_layer hdr; struct ktls_wq *wq; struct socket *so; bool running; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); if (sb->sb_flags & SB_TLS_RX_RUNNING) return; /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < sizeof(hdr)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) so->so_error = EMSGSIZE; return; } m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); /* Is the entire record queued? */ if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0) so->so_error = EMSGSIZE; return; } sb->sb_flags |= SB_TLS_RX_RUNNING; soref(so); wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_rx_queued, 1); } static struct mbuf * ktls_detach_record(struct sockbuf *sb, int len) { struct mbuf *m, *n, *top; int remain; SOCKBUF_LOCK_ASSERT(sb); MPASS(len <= sb->sb_tlscc); /* * If TLS chain is the exact size of the record, * just grab the whole record. */ top = sb->sb_mtls; if (sb->sb_tlscc == len) { sb->sb_mtls = NULL; sb->sb_mtlstail = NULL; goto out; } /* * While it would be nice to use m_split() here, we need * to know exactly what m_split() allocates to update the * accounting, so do it inline instead. */ remain = len; for (m = top; remain > m->m_len; m = m->m_next) remain -= m->m_len; /* Easy case: don't have to split 'm'. */ if (remain == m->m_len) { sb->sb_mtls = m->m_next; if (sb->sb_mtls == NULL) sb->sb_mtlstail = NULL; m->m_next = NULL; goto out; } /* * Need to allocate an mbuf to hold the remainder of 'm'. Try * with M_NOWAIT first. */ n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { /* * Use M_WAITOK with socket buffer unlocked. If * 'sb_mtls' changes while the lock is dropped, return * NULL to force the caller to retry. */ SOCKBUF_UNLOCK(sb); n = m_get(M_WAITOK, MT_DATA); SOCKBUF_LOCK(sb); if (sb->sb_mtls != top) { m_free(n); return (NULL); } } n->m_flags |= M_NOTREADY; /* Store remainder in 'n'. */ n->m_len = m->m_len - remain; if (m->m_flags & M_EXT) { n->m_data = m->m_data + remain; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); } /* Trim 'm' and update accounting. */ m->m_len -= n->m_len; sb->sb_tlscc -= n->m_len; sb->sb_ccc -= n->m_len; /* Account for 'n'. */ sballoc_ktls_rx(sb, n); /* Insert 'n' into the TLS chain. */ sb->sb_mtls = n; n->m_next = m->m_next; if (sb->sb_mtlstail == m) sb->sb_mtlstail = n; /* Detach the record from the TLS chain. */ m->m_next = NULL; out: MPASS(m_length(top, NULL) == len); for (m = top; m != NULL; m = m->m_next) sbfree_ktls_rx(sb, m); sb->sb_tlsdcc = len; sb->sb_ccc += len; SBCHECK(sb); return (top); } static void ktls_decrypt(struct socket *so) { char tls_header[MBUF_PEXT_HDR_LEN]; struct ktls_session *tls; struct sockbuf *sb; struct tls_record_layer *hdr; struct tls_get_record tgr; struct mbuf *control, *data, *m; uint64_t seqno; int error, remain, tls_len, trail_len; hdr = (struct tls_record_layer *)tls_header; sb = &so->so_rcv; SOCKBUF_LOCK(sb); KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, ("%s: socket %p not running", __func__, so)); tls = sb->sb_tls_info; MPASS(tls != NULL); for (;;) { /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < tls->params.tls_hlen) break; m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); if (hdr->tls_vmajor != tls->params.tls_vmajor || hdr->tls_vminor != tls->params.tls_vminor) error = EINVAL; else if (tls_len < tls->params.tls_hlen || tls_len > tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + tls->params.tls_tlen) error = EMSGSIZE; else error = 0; if (__predict_false(error != 0)) { /* * We have a corrupted record and are likely * out of sync. The connection isn't * recoverable at this point, so abort it. */ SOCKBUF_UNLOCK(sb); counter_u64_add(ktls_offload_corrupted_records, 1); CURVNET_SET(so->so_vnet); so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = error; CURVNET_RESTORE(); goto deref; } /* Is the entire record queued? */ if (sb->sb_tlscc < tls_len) break; /* * Split out the portion of the mbuf chain containing * this TLS record. */ data = ktls_detach_record(sb, tls_len); if (data == NULL) continue; MPASS(sb->sb_tlsdcc == tls_len); seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; SBCHECK(sb); SOCKBUF_UNLOCK(sb); error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* * sbcut/drop/flush discarded these * mbufs. */ m_freem(data); break; } /* * Drop this TLS record's data, but keep * decrypting subsequent records. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; CURVNET_SET(so->so_vnet); so->so_error = EBADMSG; sorwakeup_locked(so); CURVNET_RESTORE(); m_freem(data); SOCKBUF_LOCK(sb); continue; } /* Allocate the control mbuf. */ tgr.tls_type = hdr->tls_type; tgr.tls_vmajor = hdr->tls_vmajor; tgr.tls_vminor = hdr->tls_vminor; tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - trail_len); control = sbcreatecontrol_how(&tgr, sizeof(tgr), TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* sbcut/drop/flush discarded these mbufs. */ MPASS(sb->sb_tlscc == 0); m_freem(data); m_freem(control); break; } /* * Clear the 'dcc' accounting in preparation for * adding the decrypted record. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; SBCHECK(sb); /* If there is no payload, drop all of the data. */ if (tgr.tls_length == htobe16(0)) { m_freem(data); data = NULL; } else { /* Trim header. */ remain = tls->params.tls_hlen; while (remain > 0) { if (data->m_len > remain) { data->m_data += remain; data->m_len -= remain; break; } remain -= data->m_len; data = m_free(data); } /* Trim trailer and clear M_NOTREADY. */ remain = be16toh(tgr.tls_length); m = data; for (m = data; remain > m->m_len; m = m->m_next) { m->m_flags &= ~M_NOTREADY; remain -= m->m_len; } m->m_len = remain; m_freem(m->m_next); m->m_next = NULL; m->m_flags &= ~M_NOTREADY; /* Set EOR on the final mbuf. */ m->m_flags |= M_EOR; } sbappendcontrol_locked(sb, data, control, 0); } sb->sb_flags &= ~SB_TLS_RX_RUNNING; if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) so->so_error = EMSGSIZE; sorwakeup_locked(so); deref: SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_SET(so->so_vnet); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void ktls_enqueue_to_free(struct mbuf *m) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } static void * ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m) { void *buf; int domain, running; if (m->m_epg_npgs <= 2) return (NULL); if (ktls_buffer_zone == NULL) return (NULL); if ((u_int)(ticks - wq->lastallocfail) < hz) { /* * Rate-limit allocation attempts after a failure. * ktls_buffer_import() will acquire a per-domain mutex to check * the free page queues and may fail consistently if memory is * fragmented. */ return (NULL); } buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM); if (buf == NULL) { domain = PCPU_GET(domain); wq->lastallocfail = ticks; /* * Note that this check is "racy", but the races are * harmless, and are either a spurious wakeup if * multiple threads fail allocations before the alloc * thread wakes, or waiting an extra second in case we * see an old value of running == true. */ if (!VM_DOMAIN_EMPTY(domain)) { running = atomic_load_int(&ktls_domains[domain].alloc_td.running); if (!running) wakeup(&ktls_domains[domain].alloc_td); } } return (buf); } static int ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m, struct ktls_session *tls, struct ktls_ocf_encrypt_state *state) { vm_page_t pg; int error, i, len, off; KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY), ("%p not unready & nomap mbuf\n", m)); KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen, ("page count %d larger than maximum frame length %d", m->m_epg_npgs, ktls_maxlen)); /* Anonymous mbufs are encrypted in place. */ if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) return (tls->sw_encrypt(state, tls, m, NULL, 0)); /* * For file-backed mbufs (from sendfile), anonymous wired * pages are allocated and used as the encryption destination. */ if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) { len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len - m->m_epg_1st_off; state->dst_iov[0].iov_base = (char *)state->cbuf + m->m_epg_1st_off; state->dst_iov[0].iov_len = len; state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf); i = 1; } else { off = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs; i++, off = 0) { - do { - pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | - VM_ALLOC_WIRED | VM_ALLOC_WAITFAIL); - } while (pg == NULL); - + pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | + VM_ALLOC_WIRED | VM_ALLOC_WAITOK); len = m_epg_pagelen(m, i, off); state->parray[i] = VM_PAGE_TO_PHYS(pg); state->dst_iov[i].iov_base = (char *)PHYS_TO_DMAP(state->parray[i]) + off; state->dst_iov[i].iov_len = len; } } KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small")); state->dst_iov[i].iov_base = m->m_epg_trail; state->dst_iov[i].iov_len = m->m_epg_trllen; error = tls->sw_encrypt(state, tls, m, state->dst_iov, i + 1); if (__predict_false(error != 0)) { /* Free the anonymous pages. */ if (state->cbuf != NULL) uma_zfree(ktls_buffer_zone, state->cbuf); else { for (i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(state->parray[i]); (void)vm_page_unwire_noq(pg); vm_page_free(pg); } } } return (error); } /* Number of TLS records in a batch passed to ktls_enqueue(). */ static u_int ktls_batched_records(struct mbuf *m) { int page_count, records; records = 0; page_count = m->m_epg_enc_cnt; while (page_count > 0) { records++; page_count -= m->m_epg_nrdy; m = m->m_next; } KASSERT(page_count == 0, ("%s: mismatched page count", __func__)); return (records); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct ktls_session *tls; struct ktls_wq *wq; int queued; bool running; KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf")); m->m_epg_enc_cnt = page_count; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). */ m->m_epg_so = so; queued = 1; tls = m->m_epg_tls; wq = &ktls_wq[tls->wq_index]; mtx_lock(&wq->mtx); if (__predict_false(tls->sequential_records)) { /* * For TLS 1.0, records must be encrypted * sequentially. For a given connection, all records * queued to the associated work queue are processed * sequentially. However, sendfile(2) might complete * I/O requests spanning multiple TLS records out of * order. Here we ensure TLS records are enqueued to * the work queue in FIFO order. * * tls->next_seqno holds the sequence number of the * next TLS record that should be enqueued to the work * queue. If this next record is not tls->next_seqno, * it must be a future record, so insert it, sorted by * TLS sequence number, into tls->pending_records and * return. * * If this TLS record matches tls->next_seqno, place * it in the work queue and then check * tls->pending_records to see if any * previously-queued records are now ready for * encryption. */ if (m->m_epg_seqno != tls->next_seqno) { struct mbuf *n, *p; p = NULL; STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) { if (n->m_epg_seqno > m->m_epg_seqno) break; p = n; } if (n == NULL) STAILQ_INSERT_TAIL(&tls->pending_records, m, m_epg_stailq); else if (p == NULL) STAILQ_INSERT_HEAD(&tls->pending_records, m, m_epg_stailq); else STAILQ_INSERT_AFTER(&tls->pending_records, p, m, m_epg_stailq); mtx_unlock(&wq->mtx); counter_u64_add(ktls_cnt_tx_pending, 1); return; } tls->next_seqno += ktls_batched_records(m); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); while (!STAILQ_EMPTY(&tls->pending_records)) { struct mbuf *n; n = STAILQ_FIRST(&tls->pending_records); if (n->m_epg_seqno != tls->next_seqno) break; queued++; STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq); tls->next_seqno += ktls_batched_records(n); STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq); } counter_u64_add(ktls_cnt_tx_pending, -(queued - 1)); } else STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_tx_queued, queued); } /* * Once a file-backed mbuf (from sendfile) has been encrypted, free * the pages from the file and replace them with the anonymous pages * allocated in ktls_encrypt_record(). */ static void ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state) { int i; MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0); /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ if (state->cbuf != NULL) { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[0] + ptoa(i); /* Contig pages should go back to the cache. */ m->m_ext.ext_free = ktls_free_mext_contig; } else { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[i]; /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; } /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; } static __noinline void ktls_encrypt(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state state; struct ktls_session *tls; struct socket *so; struct mbuf *m; int error, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); error = ktls_encrypt_record(wq, m, tls, &state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, &state); npages += m->m_epg_nrdy; /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ m->m_epg_tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error) { struct ktls_session *tls; struct socket *so; struct mbuf *m; int npages; m = state->m; if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, state); so = state->so; free(state, M_KTLS); /* * Drop a reference to the session now that it is no longer * needed. Existing code depends on encrypted records having * no associated session vs yet-to-be-encrypted records having * an associated session. */ tls = m->m_epg_tls; m->m_epg_tls = NULL; ktls_free(tls); if (error != 0) counter_u64_add(ktls_offload_failed_crypto, 1); CURVNET_SET(so->so_vnet); npages = m->m_epg_nrdy; if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, m, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(m, npages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } /* * Similar to ktls_encrypt, but used with asynchronous OCF backends * (coprocessors) where encryption does not use host CPU resources and * it can be beneficial to queue more requests than CPUs. */ static __noinline void ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state *state; struct ktls_session *tls; struct socket *so; struct mbuf *m, *n; int error, mpages, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; error = 0; for (m = top; npages != total_pages; m = n) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO); soref(so); state->so = so; state->m = m; mpages = m->m_epg_nrdy; n = m->m_next; error = ktls_encrypt_record(wq, m, tls, state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); free(state, M_KTLS); CURVNET_SET(so->so_vnet); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); break; } npages += mpages; } CURVNET_SET(so->so_vnet); if (error != 0) { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(m, total_pages - npages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } static int ktls_bind_domain(int domain) { int error; error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]); if (error != 0) return (error); curthread->td_domain.dr_policy = DOMAINSET_PREF(domain); return (0); } static void ktls_alloc_thread(void *ctx) { struct ktls_domain_info *ktls_domain = ctx; struct ktls_alloc_thread *sc = &ktls_domain->alloc_td; void **buf; struct sysctl_oid *oid; char name[80]; int domain, error, i, nbufs; domain = ktls_domain - ktls_domains; if (bootverbose) printf("Starting KTLS alloc thread for domain %d\n", domain); error = ktls_bind_domain(domain); if (error) printf("Unable to bind KTLS alloc thread for domain %d: error %d\n", domain, error); snprintf(name, sizeof(name), "domain%d", domain); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD, &sc->allocs, 0, "buffers allocated"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups", CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running", CTLFLAG_RD, &sc->running, 0, "thread running"); buf = NULL; nbufs = 0; for (;;) { atomic_store_int(&sc->running, 0); tsleep(sc, PZERO | PNOLOCK, "-", 0); atomic_store_int(&sc->running, 1); sc->wakeups++; if (nbufs != ktls_max_alloc) { free(buf, M_KTLS); nbufs = atomic_load_int(&ktls_max_alloc); buf = malloc(sizeof(void *) * nbufs, M_KTLS, M_WAITOK | M_ZERO); } /* * Below we allocate nbufs with different allocation * flags than we use when allocating normally during * encryption in the ktls worker thread. We specify * M_NORECLAIM in the worker thread. However, we omit * that flag here and add M_WAITOK so that the VM * system is permitted to perform expensive work to * defragment memory. We do this here, as it does not * matter if this thread blocks. If we block a ktls * worker thread, we risk developing backlogs of * buffers to be encrypted, leading to surges of * traffic and potential NIC output drops. */ for (i = 0; i < nbufs; i++) { buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK); sc->allocs++; } for (i = 0; i < nbufs; i++) { uma_zfree(ktls_buffer_zone, buf[i]); buf[i] = NULL; } } } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; struct socket *so, *son; STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; int cpu; cpu = wq - ktls_wq; if (bootverbose) printf("Starting KTLS worker thread for CPU %d\n", cpu); /* * Bind to a core. If ktls_bind_threads is > 1, then * we bind to the NUMA domain instead. */ if (ktls_bind_threads) { int error; if (ktls_bind_threads > 1) { struct pcpu *pc = pcpu_find(cpu); error = ktls_bind_domain(pc->pc_domain); } else { cpuset_t mask; CPU_SETOF(cpu, &mask); error = cpuset_setthread(curthread->td_tid, &mask); } if (error) printf("Unable to bind KTLS worker thread for CPU %d: error %d\n", cpu, error); } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->m_head) && STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_m_head); STAILQ_CONCAT(&local_m_head, &wq->m_head); STAILQ_INIT(&local_so_head); STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); m_free_raw(m); } else { if (m->m_epg_tls->sync_dispatch) ktls_encrypt(wq, m); else ktls_encrypt_async(wq, m); counter_u64_add(ktls_cnt_tx_queued, -1); } } STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { ktls_decrypt(so); counter_u64_add(ktls_cnt_rx_queued, -1); } } } #if defined(INET) || defined(INET6) static void ktls_disable_ifnet_help(void *context, int pending __unused) { struct ktls_session *tls; struct inpcb *inp; struct tcpcb *tp; struct socket *so; int err; tls = context; inp = tls->inp; if (inp == NULL) return; INP_WLOCK(inp); so = inp->inp_socket; MPASS(so != NULL); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { goto out; } if (so->so_snd.sb_tls_info != NULL) err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW); else err = ENXIO; if (err == 0) { counter_u64_add(ktls_ifnet_disable_ok, 1); /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && (inp->inp_flags2 & INP_FREED) == 0 && (tp = intotcpcb(inp)) != NULL && tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 0); } else { counter_u64_add(ktls_ifnet_disable_fail, 1); } out: SOCK_LOCK(so); sorele(so); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); ktls_free(tls); } /* * Called when re-transmits are becoming a substantial portion of the * sends on this connection. When this happens, we transition the * connection to software TLS. This is needed because most inline TLS * NICs keep crypto state only for in-order transmits. This means * that to handle a TCP rexmit (which is out-of-order), the NIC must * re-DMA the entire TLS record up to and including the current * segment. This means that when re-transmitting the last ~1448 byte * segment of a 16KB TLS record, we could wind up re-DMA'ing an order * of magnitude more data than we are sending. This can cause the * PCIe link to saturate well before the network, which can cause * output drops, and a general loss of capacity. */ void ktls_disable_ifnet(void *arg) { struct tcpcb *tp; struct inpcb *inp; struct socket *so; struct ktls_session *tls; tp = arg; inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); so = inp->inp_socket; SOCK_LOCK(so); tls = so->so_snd.sb_tls_info; if (tls->disable_ifnet_pending) { SOCK_UNLOCK(so); return; } /* * note that disable_ifnet_pending is never cleared; disabling * ifnet can only be done once per session, so we never want * to do it again */ (void)ktls_hold(tls); in_pcbref(inp); soref(so); tls->disable_ifnet_pending = true; tls->inp = inp; SOCK_UNLOCK(so); TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); } #endif diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 0f1a0c5e4e0c..e0793a16d76f 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -1,2272 +1,2271 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "void*", "void*"); SDT_PROBE_DEFINE(sdt, , , m__cljset); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, "struct mbuf *", "mbufinfo_t *"); #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 160); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); #if defined(__powerpc__) && defined(BOOKE) /* PowerPC booke has 64-bit physical pointers. */ CTASSERT(sizeof(struct m_ext) == 184); #else CTASSERT(sizeof(struct m_ext) == 180); #endif #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does. */ #ifdef INVARIANTS static struct mbuf __used m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster. */ void mb_dupcl(struct mbuf *n, struct mbuf *m) { volatile u_int *refcnt; KASSERT(m->m_flags & (M_EXT|M_EXTPG), ("%s: M_EXT|M_EXTPG not set on %p", __func__, m)); KASSERT(!(n->m_flags & (M_EXT|M_EXTPG)), ("%s: M_EXT|M_EXTPG set on %p", __func__, n)); /* * Cache access optimization. * * o Regular M_EXT storage doesn't need full copy of m_ext, since * the holder of the 'ext_count' is responsible to carry the free * routine and its arguments. * o M_EXTPG data is split between main part of mbuf and m_ext, the * main part is copied in full, the m_ext part is similar to M_EXT. * o EXT_EXTREF, where 'ext_cnt' doesn't point into mbuf at all, is * special - it needs full copy of m_ext into each mbuf, since any * copy could end up as the last to free. */ if (m->m_flags & M_EXTPG) { bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy, __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy)); bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen); } else if (m->m_ext.ext_type == EXT_EXTREF) bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); else bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt == 1) *refcnt += 1; else atomic_add_int(refcnt, 1); } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | M_EXTPG | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 or panics when bad and 1 on all tests passed. * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they * blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * Non-inlined part of m_init(). */ int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); #ifdef NUMA m->m_pkthdr.numa_domain = M_NODOM; #endif #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & (M_EXT | M_EXTPG)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; from->m_pkthdr.snd_tag = NULL; } } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & (M_EXT | M_EXTPG)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) m_snd_tag_ref(from->m_pkthdr.snd_tag); SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & (M_EXT|M_EXTPG)) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & (M_EXT|M_EXTPG)) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & (M_EXT|M_EXTPG)) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } static void m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) { struct iovec iov; struct uio uio; int error; KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); KASSERT(off < m->m_len, ("m_copyfromunmapped: len exceeds mbuf length")); iov.iov_base = cp; iov.iov_len = len; uio.uio_resid = len; uio.uio_iov = &iov; uio.uio_segflg = UIO_SYSSPACE; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_rw = UIO_READ; error = m_unmapped_uiomove(m, off, &uio, len); KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, len)); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); if ((m->m_flags & M_EXTPG) != 0) m_copyfromunmapped(m, off, count, cp); else bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); m_copydata(m, moff, chunk, n->m_data + n->m_len); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || (n->m_flags & M_EXTPG) != 0 || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } void m_adj_decap(struct mbuf *mp, int len) { uint8_t rsstype; m_adj(mp, len); if ((mp->m_flags & M_PKTHDR) != 0) { /* * If flowid was calculated by card from the inner * headers, move flowid to the decapsulated mbuf * chain, otherwise clear. This depends on the * internals of m_adj, which keeps pkthdr as is, in * particular not changing rsstype and flowid. */ rsstype = mp->m_pkthdr.rsstype; if ((rsstype & M_HASHTYPE_INNER) != 0) { M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER); } else { M_HASHTYPE_CLEAR(mp); } } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; KASSERT((n->m_flags & M_EXTPG) == 0, ("%s: unmapped mbuf %p", __func__, n)); /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). */ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { n->m_pkthdr.snd_tag = m_snd_tag_ref(m0->m_pkthdr.snd_tag); n->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { n->m_pkthdr.snd_tag = m_snd_tag_ref(m0->m_pkthdr.snd_tag); n->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & (M_EXT|M_EXTPG)) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & (M_EXT|M_EXTPG)) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MHLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } static void m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp) { struct iovec iov; struct uio uio; int error; KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off)); KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len)); KASSERT(off < m->m_len, ("m_copytounmapped: len exceeds mbuf length")); iov.iov_base = __DECONST(caddr_t, cp); iov.iov_len = len; uio.uio_resid = len; uio.uio_iov = &iov; uio.uio_segflg = UIO_SYSSPACE; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_rw = UIO_WRITE; error = m_unmapped_uiomove(m, off, &uio, len); KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off, len)); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); if ((m->m_flags & M_EXTPG) != 0) m_copytounmapped(m, off, mlen, cp); else bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } static int m_apply_extpg_one(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { void *p; u_int i, count, pgoff, pglen; int rval; KASSERT(PMAP_HAS_DMAP, ("m_apply_extpg_one does not support unmapped mbufs")); off += mtod(m, vm_offset_t); if (off < m->m_epg_hdrlen) { count = min(m->m_epg_hdrlen - off, len); rval = f(arg, m->m_epg_hdr + off, count); if (rval) return (rval); len -= count; off = 0; } else off -= m->m_epg_hdrlen; pgoff = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off < pglen) { count = min(pglen - off, len); p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff); rval = f(arg, p, count); if (rval) return (rval); len -= count; off = 0; } else off -= pglen; pgoff = 0; } if (len > 0) { KASSERT(off < m->m_epg_trllen, ("m_apply_extpg_one: offset beyond trailer")); KASSERT(len <= m->m_epg_trllen - off, ("m_apply_extpg_one: length beyond trailer")); return (f(arg, m->m_epg_trail + off, len)); } return (0); } /* Apply function f to the data in a single mbuf. */ static int m_apply_one(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { if ((m->m_flags & M_EXTPG) != 0) return (m_apply_extpg_one(m, off, len, f, arg)); else return (f(arg, mtod(m, caddr_t) + off, len)); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = m_apply_one(m, off, count, f, arg); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. */ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Return the number of fragments an mbuf will use. This is usually * used as a proxy for the number of scatter/gather elements needed by * a DMA engine to access an mbuf. In general mapped mbufs are * assumed to be backed by physically contiguous buffers that only * need a single fragment. Unmapped mbufs, on the other hand, can * span disjoint physical pages. */ static int frags_per_mbuf(struct mbuf *m) { int frags; if ((m->m_flags & M_EXTPG) == 0) return (1); /* * The header and trailer are counted as a single fragment * each when present. * * XXX: This overestimates the number of fragments by assuming * all the backing physical pages are disjoint. */ frags = 0; if (m->m_epg_hdrlen != 0) frags++; frags += m->m_epg_npgs; if (m->m_epg_trllen != 0) frags++; return (frags); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and * the original mbuf chain is left in its present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags += frags_per_mbuf(m); /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { m_copydata(n, 0, n->m_len, mtod(m, char *) + m->m_len); m->m_len += n->m_len; m->m_next = n->m_next; curfrags -= frags_per_mbuf(n); m_free(n); if (curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; m_copydata(n, 0, n->m_len, mtod(m, char *)); m_copydata(n2, 0, n2->m_len, mtod(m, char *) + n->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; curfrags += 1; /* For the new cluster */ curfrags -= frags_per_mbuf(n); curfrags -= frags_per_mbuf(n2); m_free(n); m_free(n2); if (curfrags <= maxfrags) return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain. There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_first, *m_last; int divisor = 255, progress = 0, fraglen; if (!(m0->m_flags & M_PKTHDR)) return (m0); if (length == 0 || length < -2) return (m0); if (length > MCLBYTES) length = MCLBYTES; if (length < 0 && divisor > MCLBYTES) divisor = MCLBYTES; if (length == -1) length = 1 + (arc4random() % divisor); if (length > 0) fraglen = length; m_fixhdr(m0); /* Needed sanity check */ m_first = m_getcl(how, MT_DATA, M_PKTHDR); if (m_first == NULL) goto nospace; if (m_dup_pkthdr(m_first, m0, how) == 0) goto nospace; m_last = m_first; while (progress < m0->m_pkthdr.len) { if (length == -2) fraglen = 1 + (arc4random() % divisor); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (progress != 0) { struct mbuf *m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; m_last->m_next = m_new; m_last = m_new; } m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); progress += fraglen; m_last->m_len = fraglen; } m_freem(m0); m0 = m_first; return (m0); nospace: if (m_first) m_freem(m_first); /* Return the original chain on failure */ return (m0); } #endif /* * Free pages from mbuf_ext_pgs, assuming they were allocated via * vm_page_alloc() and aren't associated with any object. Complement * to allocator from m_uiotombuf_nomap(). */ void mb_free_mext_pgs(struct mbuf *m) { vm_page_t pg; M_ASSERTEXTPG(m); for (int i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); vm_page_unwire_noq(pg); vm_page_free(pg); } } static struct mbuf * m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) { struct mbuf *m, *mb, *prev; vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; int error, length, i, needed; ssize_t total; - int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | - VM_ALLOC_WIRED; + int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED; MPASS((flags & M_PKTHDR) == 0); MPASS((how & M_ZERO) == 0); /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = MIN(uio->uio_resid, len); else total = uio->uio_resid; if (maxseg == 0) maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; /* * If total is zero, return an empty mbuf. This can occur * for TLS 1.0 connections which send empty fragments as * a countermeasure against the known-IV weakness in CBC * ciphersuites. */ if (__predict_false(total == 0)) { mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (mb == NULL) return (NULL); mb->m_epg_flags = EPG_FLAG_ANON; return (mb); } /* * Allocate the pages */ m = NULL; while (total > 0) { mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (mb == NULL) goto failed; if (m == NULL) m = mb; else prev->m_next = mb; prev = mb; mb->m_epg_flags = EPG_FLAG_ANON; needed = length = MIN(maxseg, total); for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { retry_page: - pg_array[i] = vm_page_alloc(NULL, 0, pflags); + pg_array[i] = vm_page_alloc_noobj(pflags); if (pg_array[i] == NULL) { if (how & M_NOWAIT) { goto failed; } else { vm_wait(NULL); goto retry_page; } } mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); mb->m_epg_npgs++; } mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1); MBUF_EXT_PGS_ASSERT_SANITY(mb); total -= length; error = uiomove_fromphys(pg_array, 0, length, uio); if (error != 0) goto failed; mb->m_len = length; mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } return (m); failed: m_freem(m); return (NULL); } /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; if (flags & M_EXTPG) return (m_uiotombuf_nomap(uio, how, len, align, flags)); /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = (uio->uio_resid < len) ? uio->uio_resid : len; else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy data to/from an unmapped mbuf into a uio limited by len if set. */ int m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len) { vm_page_t pg; int error, i, off, pglen, pgoff, seglen, segoff; M_ASSERTEXTPG(m); error = 0; /* Skip over any data removed from the front. */ off = mtod(m, vm_offset_t); off += m_off; if (m->m_epg_hdrlen != 0) { if (off >= m->m_epg_hdrlen) { off -= m->m_epg_hdrlen; } else { seglen = m->m_epg_hdrlen - off; segoff = off; seglen = min(seglen, len); off = 0; len -= seglen; error = uiomove(__DECONST(void *, &m->m_epg_hdr[segoff]), seglen, uio); } } pgoff = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off >= pglen) { off -= pglen; pgoff = 0; continue; } seglen = pglen - off; segoff = pgoff + off; off = 0; seglen = min(seglen, len); len -= seglen; pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); error = uiomove_fromphys(&pg, segoff, seglen, uio); pgoff = 0; }; if (len != 0 && error == 0) { KASSERT((off + len) <= m->m_epg_trllen, ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, m->m_epg_trllen, m_off)); error = uiomove(__DECONST(void *, &m->m_epg_trail[off]), len, uio); } return (error); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, 0, uio, length); else error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. */ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? */ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index e234bf2d6563..352c341d05f7 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,5553 +1,5552 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static char *buf; /* buffer header pool */ static struct buf * nbufp(unsigned i) { return ((struct buf *)(buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * i)); } caddr_t __read_mostly unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_range(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_daemon_wait: * * Sleep until the domain falls below a limit or one second passes. */ static void bufspace_daemon_wait(struct bufdomain *bd) { /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, "-", hz); } else { /* Avoid spurious wakeups while running. */ atomic_store_int(&bd->bd_running, 1); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd; EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); bd = arg; for (;;) { kthread_suspend_check(); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speedup dirty if we've run out of clean * buffers. This is possible in particular * because softdep may held many bufs locked * pending writes to other bufs which are * marked for delayed write, exhausting * clean space until they are written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } bufspace_daemon_wait(bd); } } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { /* * This function and its results are protected by higher level * synchronization requiring vnode and buf locks to page in and * validate pages. */ if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > maxphys) maxbcachebuf = maxphys; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * With KASAN or KMSAN enabled, the kernel map is shadowed. Account for * this when sizing maps based on the amount of physical memory * available. */ #if defined(KASAN) physmem_est = (physmem_est * KASAN_SHADOW_SCALE) / (KASAN_SHADOW_SCALE + 1); #elif defined(KMSAN) physmem_est /= 3; /* * KMSAN cannot reliably determine whether buffer data is initialized * unless it is updated through a KVA mapping. */ unmapped_buf_allowed = 0; #endif /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > maxphys) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / maxphys; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } if (nswbuf == 0) { nswbuf = min(nbuf / 4, 256); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; } /* * Reserve space for the buffer cache buffers */ buf = (char *)v; v = (caddr_t)buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * nbuf; return (v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(maxphys); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = nbufp(i); bzero(bp, sizeof(*bp) + sizeof(vm_page_t) * atop(maxbcachebuf)); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + maxphys, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int i, iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) nbusy++; } if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. */ for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (!KERNEL_PANICKED()) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); MPASS((bp->b_flags & B_MAXPHYS) == 0); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. */ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs, error; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_reycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition below threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); error = BUF_LOCK(bp, LK_EXCLUSIVE, NULL); KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp, error)); (void)error; KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); MPASS((bp->b_flags & B_MAXPHYS) == 0); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(maxsize <= maxbcachebuf, ("bufkva_alloc kva too large %d %u", maxsize, maxbcachebuf)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags * are specified. */ error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } KASSERT(blkno == bp->b_lblkno, ("getblkx returned buffer for blkno %jd instead of blkno %jd", (intmax_t)bp->b_lblkno, (intmax_t)blkno)); flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; struct bufdomain *bd; bd = &bdomain[bo->bo_domain]; if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("brelse: SU io not finished bp %p", bp)); } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_flags & B_INVALONERR)) { /* * Forced invalidation of dirty buffer contents, to be used * after a failed write in the rare case that the loss of the * contents is acceptable. The buffer is invalidated and * freed. */ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); bp->b_xflags &= ~(BX_CVTENXIO); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_xflags &= ~(BX_CVTENXIO); if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("bqrelse: SU io not finished bp %p", bp)); } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", blockcount_read(&obj->paging_in_progress), bp->b_npages)); vp = bp->b_vp; VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_object != NULL, vp); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_relookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_busy_acquire(m, VM_ALLOC_SBUSY); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_sunbusy(m); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; if (bp->b_npages < desiredpages) { KASSERT(desiredpages <= atop(maxbcachebuf), ("vfs_vmio_extend past maxbcachebuf %p %d %u", bp, desiredpages, maxbcachebuf)); /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = maxphys / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon() { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. */ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. */ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { return (gbincore_unlocked(bo, blkno)); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ bool inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m, n; vm_ooffset_t off; int valid; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return (true); if (vp->v_mount == NULL) return (false); obj = vp->v_object; if (obj == NULL) return (false); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); recheck: if (m == NULL) return (false); tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); /* * Consider page validity only if page mapping didn't change * during the check. */ valid = vm_page_is_valid(m, (vm_offset_t)((toff + off) & PAGE_MASK), tinc); n = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); if (m != n) { m = n; goto recheck; } if (!valid) return (false); } return (true); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); vfs_busy_pages_acquire(bp); vfs_setdirty_range(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vfs_busy_pages_release(bp); } static void vfs_setdirty_range(struct buf *bp) { vm_offset_t boffset; vm_offset_t eoffset; int i; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = dblkno; /* Attempt lockless lookup first. */ bp = gbincore_unlocked(bo, blkno); if (bp == NULL) { /* * With GB_NOCREAT we must be sure about not finding the buffer * as it may have been reassigned during unlocked lookup. */ if ((flags & GB_NOCREAT) != 0) goto loop; goto newbuf_unlocked; } error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0, 0); if (error != 0) goto loop; /* Verify buf identify has not changed since lookup. */ if (bp->b_bufobj == bo && bp->b_lblkno == blkno) goto foundbuf_fastpath; /* It changed, fallback to locked lookup. */ BUF_UNLOCK_RAW(bp); loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_INTERLOCK | ((flags & GB_LOCK_NOWAIT) ? LK_NOWAIT : LK_SLEEPFAIL); error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); foundbuf_fastpath: /* If recursed, assume caller knows the rules. */ if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); newbuf_unlocked: /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); bsize = vn_isdisk(vp) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && !vn_isdisk(vp)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Acquire a shared busy on all pages in the buf. */ void vfs_busy_pages_acquire(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY); } void vfs_busy_pages_release(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, bp->b_npages); vfs_busy_pages_acquire(bp); } if (bp->b_bufsize != 0) vfs_setdirty_range(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_assert_sbusied(m); /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (vm_page_all_valid(m) && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); /* * Busy may not be strictly necessary here because the pages are * unlikely to be fully valid and the vnode lock will synchronize * their access via getpages. It is grabbed for consistency with * other page validation. */ vfs_busy_pages_acquire(bp); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vfs_busy_pages_release(bp); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages_acquire(bp); sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE, roundup2(ea - sa, DEV_BSIZE)); } vfs_busy_pages_release(bp); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(to - from <= maxbcachebuf, ("vm_hold_load_pages too large %p %#jx %#jx %u", bp, (uintmax_t)from, (uintmax_t)to, maxbcachebuf)); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ - p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | - VM_ALLOC_WAITOK); + p = vm_page_alloc_noobj(VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | + VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; vm_page_unwire_noq(p); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, void *uaddr, size_t len, int mapbuf) { vm_prot_t prot; int pidx; MPASS((bp->b_flags & B_MAXPHYS) != 0); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uaddr, len, prot, bp->b_pages, PBUF_PAGES); if (pidx < 0) return (-1); bp->b_bufsize = len; bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)uaddr) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return (0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. */ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bo_bs, bsize; int br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; error = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)), &bo_bs); if (error != 0) return (VM_PAGER_ERROR); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; again: for (i = 0; i < count; i++) { if (ma[i] != bogus_page) vm_page_busy_downgrade(ma[i]); } lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; if (m == bogus_page) continue; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (vm_page_all_valid(m)) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; error = get_blksize(vp, lbn, &bsize); if (error == 0) error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (bp->b_rcred == curthread->td_ucred) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || !vm_page_all_valid(m)) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || vm_page_all_valid(m) || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { if (!vm_page_none_valid(m) && !vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); } next_page:; } end_pages: redo = false; for (i = 0; i < count; i++) { if (ma[i] == bogus_page) continue; if (vm_page_busy_tryupgrade(ma[i]) == 0) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex, VM_ALLOC_NORMAL); } /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (!vm_page_all_valid(ma[i])) redo = true; } if (redo && error == 0) goto again; return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS); db_printf("b_vflags=0x%b b_ioflags0x%b\n", (u_int)bp->b_vflags, PRINT_BUF_VFLAGS, (u_int)bp->b_ioflags, PRINT_BIO_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, " "b_vp = %p, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } DB_SHOW_COMMAND(bufqueues, bufqueues) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i && BUF_ISLOCKED(bp)) { cnt++; total += bp->b_bufsize; } } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i) { cnt++; total += bp->b_bufsize; } } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index d459bbeab1b2..a41614c5457b 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -1,3769 +1,3765 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish */ /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef PMAP_DEBUG #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) #define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) #define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) #define pmap_pde_pindex(v) ((v) >> PDRSHIFT) #ifdef __mips_n64 #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #else #define NUPDE (NPDEPG) #define NUSERPGTBLS (NUPDE) #endif #define is_kernel_pmap(x) ((x) == kernel_pmap) struct pmap kernel_pmap_store; pd_entry_t *kernel_segmap; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int need_local_mappings; static int nkpt; unsigned pmap_max_asid; /* max ASID supported by the system */ #define PMAP_ASID_RESERVED 0 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; static void pmap_asid_alloc(pmap_t pmap); static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_grow_direct_page(int req); static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m); static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); static void pmap_invalidate_all(pmap_t pmap); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t); static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_range_action(void *arg); static void pmap_update_page_action(void *arg); #ifndef __mips_n64 static vm_offset_t crashdumpva; /* * These functions are for high memory (memory above 512Meg in 32 bit) support. * The highmem area does not have a KSEG0 mapping, and we need a mechanism to * do temporary per-CPU mappings for pmap_zero_page, pmap_copy_page etc. * * At bootup, we reserve 2 virtual pages per CPU for mapping highmem pages. To * access a highmem physical address on a CPU, we map the physical address to * the reserved virtual address for the CPU in the kernel pagetable. */ static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; if (need_local_mappings == 0) return; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap1_addr != 0) continue; pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap1_ptep = pmap_pte(kernel_pmap, pages); pc->pc_cmap2_ptep = pmap_pte(kernel_pmap, pages + PAGE_SIZE); pc->pc_qmap_ptep = pmap_pte(kernel_pmap, pages + (PAGE_SIZE * 2)); pc->pc_cmap1_addr = pages; pc->pc_cmap2_addr = pages + PAGE_SIZE; pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); static __inline void pmap_alloc_lmem_map(void) { PCPU_SET(cmap1_addr, virtual_avail); PCPU_SET(cmap2_addr, virtual_avail + PAGE_SIZE); PCPU_SET(cmap1_ptep, pmap_pte(kernel_pmap, virtual_avail)); PCPU_SET(cmap2_ptep, pmap_pte(kernel_pmap, virtual_avail + PAGE_SIZE)); PCPU_SET(qmap_addr, virtual_avail + (2 * PAGE_SIZE)); PCPU_SET(qmap_ptep, pmap_pte(kernel_pmap, virtual_avail + (2 * PAGE_SIZE))); crashdumpva = virtual_avail + (3 * PAGE_SIZE); virtual_avail += PAGE_SIZE * 4; } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { critical_enter(); *PCPU_GET(cmap1_ptep) = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; return (PCPU_GET(cmap1_addr)); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { critical_enter(); *PCPU_GET(cmap1_ptep) = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; *PCPU_GET(cmap2_ptep) = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; return (PCPU_GET(cmap1_addr)); } static __inline void pmap_lmem_unmap(void) { *PCPU_GET(cmap1_ptep) = PTE_G; tlb_invalidate_address(kernel_pmap, PCPU_GET(cmap1_addr)); if (*PCPU_GET(cmap2_ptep) != PTE_G) { *PCPU_GET(cmap2_ptep) = PTE_G; tlb_invalidate_address(kernel_pmap, PCPU_GET(cmap2_addr)); } critical_exit(); } #else /* __mips_n64 */ static __inline void pmap_alloc_lmem_map(void) { } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { return (0); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { return (0); } static __inline vm_offset_t pmap_lmem_unmap(void) { return (0); } #endif /* !__mips_n64 */ static __inline int pmap_pte_cache_bits(vm_paddr_t pa, vm_page_t m) { vm_memattr_t ma; ma = pmap_page_get_memattr(m); if (ma == VM_MEMATTR_WRITE_BACK && !is_cacheable_mem(pa)) ma = VM_MEMATTR_UNCACHEABLE; return PTE_C(ma); } #define PMAP_PTE_SET_CACHE_BITS(pte, pa, m) { \ pte &= ~PTE_C_MASK; \ pte |= pmap_pte_cache_bits(pa, m); \ } /* * Page table entry lookup routines. */ static __inline pd_entry_t * pmap_segmap(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_segtab[pmap_seg_index(va)]); } #ifdef __mips_n64 static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)*pdpe; return (&pde[pmap_pde_index(va)]); } static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pd_entry_t *pdpe; pdpe = pmap_segmap(pmap, va); if (*pdpe == NULL) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } #else static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { return (pdpe); } static __inline pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va) { return (pmap_segmap(pmap, va)); } #endif static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)*pde; return (&pte[pmap_pte_index(va)]); } pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || *pde == NULL) return (NULL); return (pmap_pde_to_pte(pde, va)); } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_paddr_t bank_size, pa; vm_offset_t va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i + 2]; i += 2) { phys_avail[i] = phys_avail[i + 2]; phys_avail[i + 1] = phys_avail[i + 3]; } phys_avail[i] = 0; phys_avail[i + 1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; if (MIPS_DIRECT_MAPPABLE(pa) == 0) panic("Out of memory below 512Meg?"); va = MIPS_PHYS_TO_DIRECT(pa); bzero((caddr_t)va, size); return (va); } /* * Bootstrap the system enough to run with virtual memory. This * assumes that the phys_avail array has been initialized. */ static void pmap_create_kernel_pagetable(void) { int i, j; vm_offset_t ptaddr; pt_entry_t *pte; #ifdef __mips_n64 pd_entry_t *pde; vm_offset_t pdaddr; int npt, npde; #endif /* * Allocate segment table for the kernel */ kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE); /* * Allocate second level page tables for the kernel */ #ifdef __mips_n64 npde = howmany(NKPT, NPDEPG); pdaddr = pmap_steal_memory(PAGE_SIZE * npde); #endif nkpt = NKPT; ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt); /* * The R[4-7]?00 stores only one copy of the Global bit in the * translation lookaside buffer for each 2 page entry. Thus invalid * entrys must have the Global bit set so when Entry LO and Entry HI * G bits are anded together they will produce a global bit to store * in the tlb. */ for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++) *pte = PTE_G; #ifdef __mips_n64 for (i = 0, npt = nkpt; npt > 0; i++) { kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE); pde = (pd_entry_t *)kernel_segmap[i]; for (j = 0; j < NPDEPG && npt > 0; j++, npt--) pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * PAGE_SIZE); } #else for (i = 0, j = pmap_seg_index(VM_MIN_KERNEL_ADDRESS); i < nkpt; i++, j++) kernel_segmap[j] = (pd_entry_t)(ptaddr + (i * PAGE_SIZE)); #endif PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_segtab = kernel_segmap; CPU_FILL(&kernel_pmap->pm_active); TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED; kernel_pmap->pm_asid[0].gen = 0; kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE; } void pmap_bootstrap(void) { int i; /* Sort. */ again: for (i = 0; phys_avail[i + 1] != 0; i += 2) { /* * Keep the memory aligned on page boundary. */ phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); if (i < 2) continue; if (phys_avail[i - 2] > phys_avail[i]) { vm_paddr_t ptemp[2]; ptemp[0] = phys_avail[i + 0]; ptemp[1] = phys_avail[i + 1]; phys_avail[i + 0] = phys_avail[i - 2]; phys_avail[i + 1] = phys_avail[i - 1]; phys_avail[i - 2] = ptemp[0]; phys_avail[i - 1] = ptemp[1]; goto again; } } /* * In 32 bit, we may have memory which cannot be mapped directly. * This memory will need temporary mapping before it can be * accessed. */ if (!MIPS_DIRECT_MAPPABLE(phys_avail[i - 1] - 1)) need_local_mappings = 1; /* * Copy the phys_avail[] array before we start stealing memory from it. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { physmem_desc[i] = phys_avail[i]; physmem_desc[i + 1] = phys_avail[i + 1]; } Maxmem = atop(phys_avail[i - 1]); if (bootverbose) { printf("Physical memory chunk(s):\n"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { vm_paddr_t size; size = phys_avail[i + 1] - phys_avail[i]; printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", (uintmax_t) phys_avail[i], (uintmax_t) phys_avail[i + 1] - 1, (uintmax_t) size, (uintmax_t) size / PAGE_SIZE); } printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem)); } /* * Steal the message buffer from the beginning of memory. */ msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize); msgbufinit(msgbufp, msgbufsize); /* * Steal thread0 kstack. */ kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; #ifdef SMP /* * Steal some virtual address space to map the pcpu area. */ virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2); pcpup = (struct pcpu *)virtual_avail; virtual_avail += PAGE_SIZE * 2; /* * Initialize the wired TLB entry mapping the pcpu region for * the BSP at 'pcpup'. Up until this point we were operating * with the 'pcpup' for the BSP pointing to a virtual address * in KSEG0 so there was no need for a TLB mapping. */ mips_pcpu_tlb_init(PCPU_ADDR(0)); if (bootverbose) printf("pcpu is available at virtual address %p.\n", pcpup); #endif pmap_create_kernel_pagetable(); if (need_local_mappings) pmap_alloc_lmem_map(); pmap_max_asid = VMNUM_PIDS; mips_wr_entryhi(0); mips_wr_pagemask(0); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_flags = VM_MEMATTR_DEFAULT << PV_MEMATTR_SHIFT; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { } /*************************************************** * Low level helper routines..... ***************************************************/ #ifdef SMP static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid, cpu, self; cpuset_t active_cpus; sched_pin(); if (is_kernel_pmap(pmap)) { smp_rendezvous(NULL, fn, NULL, arg); goto out; } /* Force ASID update on inactive CPUs */ CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &pmap->pm_active)) pmap->pm_asid[cpu].gen = 0; } cpuid = PCPU_GET(cpuid); /* * XXX: barrier/locking for active? * * Take a snapshot of active here, any further changes are ignored. * tlb update/invalidate should be harmless on inactive CPUs */ active_cpus = pmap->pm_active; self = CPU_ISSET(cpuid, &active_cpus); CPU_CLR(cpuid, &active_cpus); /* Optimize for the case where this cpu is the only active one */ if (CPU_EMPTY(&active_cpus)) { if (self) fn(arg); } else { if (self) CPU_SET(cpuid, &active_cpus); smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); } out: sched_unpin(); } #else /* !SMP */ static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid; if (is_kernel_pmap(pmap)) { fn(arg); return; } cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &pmap->pm_active)) pmap->pm_asid[cpuid].gen = 0; else fn(arg); } #endif /* SMP */ static void pmap_invalidate_all(pmap_t pmap) { pmap_call_on_active_cpus(pmap, (void (*)(void *))tlb_invalidate_all_user, pmap); } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page_action(void *arg) { struct pmap_invalidate_page_arg *p = arg; tlb_invalidate_address(p->pmap, p->va); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); } struct pmap_invalidate_range_arg { pmap_t pmap; vm_offset_t sva; vm_offset_t eva; }; static void pmap_invalidate_range_action(void *arg) { struct pmap_invalidate_range_arg *p = arg; tlb_invalidate_range(p->pmap, p->sva, p->eva); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_invalidate_range_arg arg; arg.pmap = pmap; arg.sva = sva; arg.eva = eva; pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); } struct pmap_update_page_arg { pmap_t pmap; vm_offset_t va; pt_entry_t pte; }; static void pmap_update_page_action(void *arg) { struct pmap_update_page_arg *p = arg; tlb_update(p->pmap, p->va, p->pte); } static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) { struct pmap_update_page_arg arg; arg.pmap = pmap; arg.va = va; arg.pte = pte; pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_offset_t retval = 0; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte) { retval = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (retval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t pte, *ptep; vm_paddr_t pa; vm_page_t m; m = NULL; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pa = TLBLO_PTE_TO_PA(pte); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva */ void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte; pt_entry_t opte, npte; #ifdef PMAP_DEBUG printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif pte = pmap_pte(kernel_pmap, va); opte = *pte; npte = TLBLO_PA_TO_PFN(pa) | PTE_C(ma) | PTE_D | PTE_V | PTE_G; *pte = npte; if (pte_test(&opte, PTE_V) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { KASSERT(is_cacheable_mem(pa), ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); pmap_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); for (; size > 0; size -= PAGE_SIZE) { /* * XXXCEM: this is somewhat inefficient on SMP systems in that * every single page is individually TLB-invalidated via * rendezvous (pmap_update_page()), instead of invalidating the * entire range via a single rendezvous. */ pmap_kenter_attr(va, pa, VM_MEMATTR_UNCACHEABLE); va += PAGE_SIZE; pa += PAGE_SIZE; } } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); /* * XXXCEM: Similar to pmap_kenter_device, this is inefficient on SMP, * in that pages are invalidated individually instead of a single range * rendezvous. */ for (; size > 0; size -= PAGE_SIZE) { pmap_kremove(va); va += PAGE_SIZE; } } /* * remove a page from the kernel pagetables */ /* PMAP_INLINE */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; /* * Write back all caches from the page being destroyed */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; if (MIPS_DIRECT_MAPPABLE(end - 1)) return (MIPS_PHYS_TO_DIRECT(start)); va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; vm_offset_t origva = va; for (i = 0; i < count; i++) { pmap_flush_pvcache(m[i]); pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); va += PAGE_SIZE; } mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { pt_entry_t *pte; vm_offset_t origva; if (count < 1) return; mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); origva = va; do { pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; va += PAGE_SIZE; } while (--count > 0); pmap_invalidate_range(kernel_pmap, origva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { pd_entry_t *pde; vm_offset_t sva, eva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ #ifdef __mips_n64 if (m->pindex < NUPDE) { pde = pmap_pde(pmap, va); sva = va & ~PDRMASK; eva = sva + NBPDR; } else { pde = pmap_segmap(pmap, va); sva = va & ~SEGMASK; eva = sva + NBSEG; } #else pde = pmap_pde(pmap, va); sva = va & ~SEGMASK; eva = sva + NBSEG; #endif *pde = 0; pmap->pm_stats.resident_count--; #ifdef __mips_n64 if (m->pindex < NUPDE) { pd_entry_t *pdp; vm_page_t pdpg; /* * Recursively decrement next level pagetable refcount. * Either that shoots down a larger range from TLBs (below) * or we're to shoot down just the page in question. */ pdp = (pd_entry_t *)*pmap_segmap(pmap, va); pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); if (!pmap_unwire_ptp(pmap, va, pdpg)) { pmap_invalidate_range(pmap, sva, eva); } } else { /* Segmap entry shootdown */ pmap_invalidate_range(pmap, sva, eva); } #else /* Segmap entry shootdown */ pmap_invalidate_range(pmap, sva, eva); #endif /* * If the page is finally unwired, simply free it. */ vm_page_free_zero(m); vm_wire_sub(1); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(pde != 0, ("pmap_unuse_pt: pde != 0")); mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pde)); return (pmap_unwire_ptp(pmap, va, mpte)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_segtab = kernel_segmap; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static void pmap_grow_direct_page(int req) { #ifdef __mips_n64 vm_wait(NULL); #else if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) vm_wait(NULL); #endif } static vm_page_t pmap_alloc_direct_page(unsigned int index, int req) { vm_page_t m; m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); - - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); - m->pindex = index; return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; int i, req_class; /* * allocate the page directory page */ req_class = VM_ALLOC_NORMAL; while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) == NULL) pmap_grow_direct_page(req_class); ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_segtab = (pd_entry_t *)ptdva; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) { vm_offset_t pageva; vm_page_t m; int req_class; /* * Find or fabricate a new pagetable page */ req_class = VM_ALLOC_NORMAL; if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); pmap_grow_direct_page(req_class); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page * table page may have been allocated. */ return (NULL); } /* * Map the pagetable page into the process address space, if it * isn't already there. */ pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #ifdef __mips_n64 if (ptepindex >= NUPDE) { pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; } else { pd_entry_t *pdep, *pde; int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); int pdeindex = ptepindex & (NPDEPG - 1); vm_page_t pg; pdep = &pmap->pm_segtab[segindex]; if (*pdep == NULL) { /* recurse for allocating page dir */ if (_pmap_allocpte(pmap, NUPDE + segindex, flags) == NULL) { /* alloc failed, release current */ vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); pg->ref_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; KASSERT(pde[pdeindex] == 0, ("%s: PTE %p is valid", __func__, pde[pdeindex])); pde[pdeindex] = (pd_entry_t)pageva; } #else KASSERT(pmap->pm_segtab[ptepindex] == 0, ("%s: PTE %p is valid", __func__, pmap->pm_segtab[ptepindex])); pmap->pm_segtab[ptepindex] = (pd_entry_t)pageva; #endif pmap->pm_stats.resident_count++; return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { unsigned ptepindex; pd_entry_t *pde; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment the hold * count, and activate it. */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); ptdva = (vm_offset_t)pmap->pm_segtab; ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); vm_page_unwire_noq(ptdpg); vm_page_free_zero(ptdpg); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_page_t nkpg; pd_entry_t *pde, *pdpe; pt_entry_t *pte; int i, req_class; mtx_assert(&kernel_map->system_mtx, MA_OWNED); req_class = VM_ALLOC_INTERRUPT; addr = roundup2(addr, NBSEG); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); #ifdef __mips_n64 if (*pdpe == 0) { /* new intermediate page table entry */ nkpg = pmap_alloc_direct_page(nkpt, req_class); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); continue; /* try again */ } #endif pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if (*pde != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } /* * This index is bogus, but out of the way */ nkpg = pmap_alloc_direct_page(nkpt, req_class); #ifndef __mips_n64 if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) nkpg = pmap_alloc_direct_page(nkpt, req_class); #endif if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); /* * The R[4-7]?00 stores only one copy of the Global bit in * the translation lookaside buffer for each 2 page entry. * Thus invalid entrys must have the Global bit set so when * Entry LO and Entry HI G bits are anded together they will * produce a global bit to store in the tlb. */ pte = (pt_entry_t *)*pde; for (i = 0; i < NPTEPG; i++) pte[i] = PTE_G; kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); #ifdef __mips_n64 CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); #else CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); #endif static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #ifdef __mips_n64 #define PC_FREE0_1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful #else #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ #endif static const u_long pc_freemask[_NPCM] = { #ifdef __mips_n64 PC_FREE0_1, PC_FREE0_1, PC_FREE2 #else PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 #endif }; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, oldpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; u_long inuse; int bit, field, freed, idx; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; va = pv->pv_va; pde = pmap_pde(pmap, va); KASSERT(pde != NULL && *pde != 0, ("pmap_pv_reclaim: pde")); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if (pte_test(&oldpte, PTE_W)) continue; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); if (pte_test(&oldpte, PTE_D)) vm_page_dirty(m); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; /* * For simplicity, we will unconditionally shoot * down TLBs either at the end of this function * or at the top of the loop above if we switch * to a different pmap. */ (void)pmap_unuse_pt(pmap, va, *pde); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( (vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / (sizeof(u_long) * NBBY); bit = idx % (sizeof(u_long) * NBBY); pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { idx = field * sizeof(pc->pc_map[field]) * NBBY + bit; pv = &pc->pc_pventry[idx]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); dump_add_page(m->phys_addr); pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx", (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)), (u_long)va)); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process * * Returns true if this was the last PTE in the PT (and possibly the last PT in * the PD, and possibly the last PD in the segmap), in which case... * * 1) the TLB has been invalidated for the whole PT's span (at least), * already, to ensure that MipsDoTLBMiss does not attempt to follow a * dangling pointer into a freed page. No additional TLB shootdown is * required. * * 2) if this removal was part of a sweep to remove PTEs, it is safe to jump * to the PT span boundary and continue. * * 3) The given pde may now point onto a freed page and must not be * dereferenced * * If the return value is false, the TLB has not been shot down (and the segmap * entry, PD, and PT all remain in place). */ static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde) { pt_entry_t oldpte; vm_page_t m; vm_paddr_t pa; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Write back all cache lines from the page being unmapped. */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); oldpte = *ptq; if (is_kernel_pmap(pmap)) *ptq = PTE_G; else *ptq = 0; if (pte_test(&oldpte, PTE_W)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte_test(&oldpte, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(oldpte); m = PHYS_TO_VM_PAGE(pa); if (pte_test(&oldpte, PTE_D)) { KASSERT(!pte_test(&oldpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, pde)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(struct pmap *pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *ptq; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (pde == NULL || *pde == 0) return; ptq = pmap_pde_to_pte(pde, va); /* * If there is no pte for this address, just skip it! */ if (!pte_test(ptq, PTE_V)) return; /* * Remove this PTE from the PT. If this is the last one, then * the TLB has already been shot down, so don't bother again */ if (!pmap_remove_pte(pmap, ptq, va, *pde)) pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; vm_offset_t va_init, va_fini; bool need_tlb_shootdown; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * special handling of removing one page. a very common operation * and easy to short circuit some code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); goto out; } for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif /* Scan up to the end of the page table pointed to by pde */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; need_tlb_shootdown = false; va_init = sva; va_fini = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { /* Skip over invalid entries; no need to shootdown */ if (!pte_test(pte, PTE_V)) { /* * If we have not yet found a valid entry, then * we can move the lower edge of the region to * invalidate to the next PTE. */ if (!need_tlb_shootdown) va_init = sva + PAGE_SIZE; continue; } /* * A valid entry; the range we are shooting down must * include this page. va_fini is used instead of sva * so that if the range ends with a run of !PTE_V PTEs, * but doesn't clear out so much that pmap_remove_pte * removes the entire PT, we won't include these !PTE_V * entries in the region to be shot down. */ va_fini = sva + PAGE_SIZE; if (pmap_remove_pte(pmap, pte, sva, *pde)) { /* Entire PT removed and TLBs shot down. */ need_tlb_shootdown = false; break; } else { need_tlb_shootdown = true; } } if (need_tlb_shootdown) pmap_invalidate_range(pmap, va_init, va_fini); } out: rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte, tpte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); rw_wlock(&pvh_global_lock); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); /* * If it's last mapping writeback all caches from * the page being destroyed */ if (TAILQ_NEXT(pv, pv_list) == NULL) mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = *pte; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&tpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) { KASSERT(!pte_test(&tpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } if (!pmap_unuse_pt(pmap, pv->pv_va, *pde)) pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t pbits, *pte; pd_entry_t *pde, *pdpe; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pbits = *pte; if (!pte_test(&pbits, PTE_V) || pte_test(&pbits, PTE_RO)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pte_set(&pbits, PTE_RO); if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); if (pte_test(&pbits, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(pbits); m = PHYS_TO_VM_PAGE(pa); vm_page_dirty(m); } if (va == va_next) va = sva; } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } *pte = pbits; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { vm_paddr_t pa, opa; pt_entry_t *pte; pt_entry_t origpte, newpte; pv_entry_t pv; vm_page_t mpte, om; va &= ~PAGE_MASK; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot); if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PTE_W; if (is_kernel_pmap(pmap)) newpte |= PTE_G; PMAP_PTE_SET_CACHE_BITS(newpte, pa, m); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PTE_MANAGED; mpte = NULL; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=%p", (void *)pmap->pm_segtab, (void *)va); } origpte = *pte; KASSERT(!pte_test(&origpte, PTE_D | PTE_RO | PTE_V), ("pmap_enter: modified page not writable: va: %p, pte: %#jx", (void *)va, (uintmax_t)origpte)); opa = TLBLO_PTE_TO_PA(origpte); /* * Mapping has not changed, must be protection or wiring change. */ if (pte_test(&origpte, PTE_V) && opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is * wired, the PT page will be also. */ if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count++; else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->ref_count--; if (pte_test(&origpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; if (pte_test(&origpte, PTE_MANAGED)) { om = PHYS_TO_VM_PAGE(opa); if (pte_test(&origpte, PTE_D)) vm_page_dirty(om); if ((om->md.pv_flags & PV_TABLE_REF) != 0) { om->md.pv_flags &= ~PV_TABLE_REF; vm_page_aflag_set(om, PGA_REFERENCED); } pv = pmap_pvh_remove(&om->md, pmap, va); if (!pte_test(&newpte, PTE_MANAGED)) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); origpte = 0; if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: %p", (void *)va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if (pte_test(&newpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Increment counters */ if (pte_test(&newpte, PTE_W)) pmap->pm_stats.wired_count++; validate: #ifdef PMAP_DEBUG printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif /* * if the mapping or permission bits are different, we need to * update the pte. */ if (origpte != newpte) { *pte = newpte; if (pte_test(&origpte, PTE_V)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if (pte_test(&origpte, PTE_D)) { if (pte_test(&origpte, PTE_MANAGED)) vm_page_dirty(m); } pmap_update_page(pmap, va, newpte); } } /* * Sync I & D caches for executable pages. Do this only if the * target pmap belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && (prot & VM_PROT_EXECUTE)) { mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte, npte; vm_paddr_t pa; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { pd_entry_t *pde; unsigned ptepindex; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just * increment the hold count, and activate it. */ if (pde && *pde != 0) { mpte = PHYS_TO_VM_PAGE( MIPS_DIRECT_TO_PHYS(*pde)); mpte->ref_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } pte = pmap_pte(pmap, va); if (pte_test(pte, PTE_V)) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, mpte, va, m)) { if (mpte != NULL) { pmap_unwire_ptp(pmap, va, mpte); mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ npte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V; if ((m->oflags & VPO_UNMANAGED) == 0) npte |= PTE_MANAGED; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); if (is_kernel_pmap(pmap)) *pte = npte | PTE_G; else { *pte = npte; /* * Sync I & D caches. Do this only if the target pmap * belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (pmap == &curproc->p_vmspace->vm_pmap) { va &= ~PAGE_MASK; mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } } return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; if (i != 0) printf("%s: ERROR!!! More than one page of virtual address mapping not supported\n", __func__); if (MIPS_DIRECT_MAPPABLE(pa)) { va = MIPS_PHYS_TO_DIRECT(pa); } else { #ifndef __mips_n64 /* XXX : to be converted to new style */ pt_entry_t *pte, npte; pte = pmap_pte(kernel_pmap, crashdumpva); /* Since this is for the debugger, no locks or any other fun */ npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; *pte = npte; pmap_update_page(kernel_pmap, crashdumpva, npte); va = crashdumpva; #endif } return ((void *)va); } void pmap_kenter_temporary_free(vm_paddr_t pa) { #ifndef __mips_n64 /* XXX : to be converted to new style */ pt_entry_t *pte; #endif if (MIPS_DIRECT_MAPPABLE(pa)) { /* nothing to do for this case */ return; } #ifndef __mips_n64 /* XXX : to be converted to new style */ pte = pmap_pte(kernel_pmap, crashdumpva); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, crashdumpva); #endif } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == NULL) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); pte_clear(pte, PTE_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((char *)(caddr_t)va + off, size); mips_dcache_wbinv_range(va + off, size); } else { va = pmap_lmem_map1(phys); bzero((char *)va + off, size); mips_dcache_wbinv_range(va + off, size); pmap_lmem_unmap(); } } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_offset_t va_src, va_dst; vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); if (MIPS_DIRECT_MAPPABLE(phys_src) && MIPS_DIRECT_MAPPABLE(phys_dst)) { /* easy case, all can be accessed via KSEG0 */ /* * Flush all caches for VA that are mapped to this page * to make sure that data in SDRAM is up to date */ pmap_flush_pvcache(src); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); va_src = MIPS_PHYS_TO_DIRECT(phys_src); va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); } else { va_src = pmap_lmem_map2(phys_src, phys_dst); va_dst = va_src + PAGE_SIZE; bcopy((void *)va_src, (void *)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); pmap_lmem_unmap(); } } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { char *a_cp, *b_cp; vm_page_t a_m, b_m; vm_offset_t a_pg_offset, b_pg_offset; vm_paddr_t a_phys, b_phys; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_m = ma[a_offset >> PAGE_SHIFT]; a_phys = VM_PAGE_TO_PHYS(a_m); b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_m = mb[b_offset >> PAGE_SHIFT]; b_phys = VM_PAGE_TO_PHYS(b_m); if (MIPS_DIRECT_MAPPABLE(a_phys) && MIPS_DIRECT_MAPPABLE(b_phys)) { pmap_flush_pvcache(a_m); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); } else { a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); b_cp = (char *)a_cp + PAGE_SIZE; a_cp += a_pg_offset; b_cp += b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); pmap_lmem_unmap(); } a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { #if defined(__mips_n64) return MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #else vm_offset_t qaddr; vm_paddr_t pa; pt_entry_t *pte, npte; pa = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(pa)) { if (pmap_page_get_memattr(m) != VM_MEMATTR_WRITE_BACK) return (MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else return (MIPS_PHYS_TO_DIRECT(pa)); } critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = PCPU_GET(qmap_ptep); KASSERT(*pte == PTE_G, ("pmap_quick_enter_page: PTE busy")); npte = TLBLO_PA_TO_PFN(pa) | PTE_D | PTE_V | PTE_G; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); *pte = npte; return (qaddr); #endif } void pmap_quick_remove_page(vm_offset_t addr) { mips_dcache_wbinv_range(addr, PAGE_SIZE); #if !defined(__mips_n64) pt_entry_t *pte; if (addr >= MIPS_KSEG0_START && addr < MIPS_KSEG0_END) return; pte = PCPU_GET(qmap_ptep); KASSERT(*pte != PTE_G, ("pmap_quick_remove_page: PTE not in use")); KASSERT(PCPU_GET(qmap_addr) == addr, ("pmap_quick_remove_page: invalid address")); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, addr); critical_exit(); #endif } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; pv_entry_t pv; vm_page_t m; struct pv_chunk *pc, *npc; u_long inuse, bitmask; int allfree, bit, field, idx; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_pages: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); if (!pte_test(pte, PTE_V)) panic("pmap_remove_pages: bad pte"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (pte_test(&tpte, PTE_W)) { allfree = 0; continue; } *pte = is_kernel_pmap(pmap) ? PTE_G : 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(tpte)); KASSERT(m != NULL, ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); /* * For simplicity, unconditionally call * pmap_invalidate_all(), below. */ (void)pmap_unuse_pt(pmap, pv->pv_va, *pde); } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * pmap_testbit tests bits in pte's */ static boolean_t pmap_testbit(vm_page_t m, int bit) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; boolean_t rv = FALSE; if (m->oflags & VPO_UNMANAGED) return (rv); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = pte_test(pte, bit); PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_W)) count++; PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pmap_t pmap; pt_entry_t pbits, *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); KASSERT(pte != NULL && pte_test(pte, PTE_V), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); vm_page_dirty(m); } pte_set(&pbits, PTE_RO); if (pbits != *pte) { *pte = pbits; pmap_update_page(pmap, pv->pv_va, pbits); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); if (m->md.pv_flags & PV_TABLE_REF) { rw_wlock(&pvh_global_lock); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); return (1); } return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); rw_wunlock(&pvh_global_lock); return (rv); } /* N/C */ /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && *pde != 0) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte == 0); } PMAP_UNLOCK(pmap); return (rv); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_MANAGED | PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pa = TLBLO_PTE_TO_PA(*pte); m = PHYS_TO_VM_PAGE(pa); m->md.pv_flags &= ~PV_TABLE_REF; if (pte_test(pte, PTE_D)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ vm_page_dirty(m); } else { pte_clear(pte, PTE_D); if (va == va_next) va = sva; } } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_D)) { pte_clear(pte, PTE_D); pmap_update_page(pmap, pv->pv_va, *pte); } PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return ((m->md.pv_flags & PV_TABLE_REF) != 0); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. * * Use XKPHYS uncached for 64 bit, and KSEG1 where possible for 32 bit. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, offset; /* * KSEG1 maps only first 512M of phys address space. For * pa > 0x20000000 we should make proper mapping * using pmap_kenter. */ if (MIPS_DIRECT_MAPPABLE(pa + size - 1) && ma == VM_MEMATTR_UNCACHEABLE) return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else { offset = pa & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0;) { pmap_kenter_attr(tmpva, pa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } } return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return pmap_mapdev_attr(pa, size, VM_MEMATTR_UNCACHEABLE); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { #ifndef __mips_n64 vm_offset_t base, offset; /* If the address is within KSEG1 then there is nothing to do */ if (va >= MIPS_KSEG1_START && va <= MIPS_KSEG1_END) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); pmap_qremove(base, atop(size)); kva_free(base, size); #endif } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; if (!pte_test(&pte, PTE_V)) { PMAP_UNLOCK(pmap); return (0); } val = MINCORE_INCORE; if (pte_test(&pte, PTE_D)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; pa = TLBLO_PTE_TO_PA(pte); if (pte_test(&pte, PTE_MANAGED)) { /* * This may falsely report the given address as * MINCORE_REFERENCED. Unfortunately, due to the lack of * per-PTE reference information, it is impossible to * determine if the address is MINCORE_REFERENCED. */ m = PHYS_TO_VM_PAGE(pa); if ((m->a.flags & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && pte_test(&pte, PTE_MANAGED)) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; struct proc *p = td->td_proc; u_int cpuid; critical_enter(); pmap = vmspace_pmap(p->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); if (oldpmap) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); pmap_asid_alloc(pmap); if (td == curthread) { PCPU_SET(segbase, pmap->pm_segtab); mips_wr_entryhi(pmap->pm_asid[cpuid].asid); } PCPU_SET(curpmap, pmap); critical_exit(); } static void pmap_sync_icache_one(void *arg __unused) { mips_icache_sync_all(); mips_dcache_wbinv_all(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < PDRSIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((PDRSIZE - superpage_offset) & PDRMASK) < PDRSIZE || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef DDB DB_SHOW_COMMAND(ptable, ddb_pid_dump) { pmap_t pmap; struct thread *td = NULL; struct proc *p; int i, j, k; vm_paddr_t pa; vm_offset_t va; if (have_addr) { td = db_lookup_thread(addr, true); if (td == NULL) { db_printf("Invalid pid or tid"); return; } p = td->td_proc; if (p->p_vmspace == NULL) { db_printf("No vmspace for process"); return; } pmap = vmspace_pmap(p->p_vmspace); } else pmap = kernel_pmap; db_printf("pmap:%p segtab:%p asid:%x generation:%x\n", pmap, pmap->pm_segtab, pmap->pm_asid[0].asid, pmap->pm_asid[0].gen); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pdpe; pt_entry_t *pde; pt_entry_t pte; pdpe = (pd_entry_t *)pmap->pm_segtab[i]; if (pdpe == NULL) continue; db_printf("[%4d] %p\n", i, pdpe); #ifdef __mips_n64 for (j = 0; j < NPDEPG; j++) { pde = (pt_entry_t *)pdpe[j]; if (pde == NULL) continue; db_printf("\t[%4d] %p\n", j, pde); #else { j = 0; pde = (pt_entry_t *)pdpe; #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; if (pte == 0 || !pte_test(&pte, PTE_V)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); } } } } #endif /* * Allocate TLB address space tag (called ASID or TLBPID) and return it. * It takes almost as much or more time to search the TLB for a * specific ASID and flush those entries as it does to flush the entire TLB. * Therefore, when we allocate a new ASID, we just take the next number. When * we run out of numbers, we flush the TLB, increment the generation count * and start over. ASID zero is reserved for kernel use. */ static void pmap_asid_alloc(pmap) pmap_t pmap; { if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); else { if (PCPU_GET(next_asid) == pmap_max_asid) { tlb_invalidate_all_user(NULL); PCPU_SET(asid_generation, (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); if (PCPU_GET(asid_generation) == 0) { PCPU_SET(asid_generation, 1); } PCPU_SET(next_asid, 1); /* 0 means invalid */ } pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); } } static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) { pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) rw = PTE_V | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) rw = PTE_V | PTE_D; else rw = PTE_V; } else /* Needn't emulate a modified bit for unmanaged pages. */ rw = PTE_V | PTE_D; return (rw); } /* * pmap_emulate_modified : do dirty bit emulation * * On SMP, update just the local TLB, other CPUs will update their * TLBs from PTE lazily, if they get the exception. * Returns 0 in case of sucess, 1 if the page is read only and we * need to fault. */ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); /* * It is possible that some other CPU or thread changed the pmap while * we weren't looking; in the SMP case, this is readily apparent, but * it can even happen in the UP case, because we may have been blocked * on PMAP_LOCK(pmap) above while someone changed this out from * underneath us. */ if (pte == NULL) { /* * This PTE's PTP (or one of its ancestors) has been reclaimed; * trigger a full fault to reconstruct it via pmap_enter. */ PMAP_UNLOCK(pmap); return (1); } if (!pte_test(pte, PTE_V)) { /* * This PTE is no longer valid; the other thread or other * processor must have arranged for our TLB to no longer * have this entry, possibly by IPI, so no tlb_update is * required. Fall out of the fast path and go take a * general fault before retrying the instruction (or taking * a signal). */ PMAP_UNLOCK(pmap); return (1); } if (pte_test(pte, PTE_D)) { /* * This PTE is valid and has the PTE_D bit asserted; since * this is an increase in permission, we may have been expected * to update the TLB lazily. Do so here and return, on the * fast path, to retry the instruction. */ tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } if (pte_test(pte, PTE_RO)) { /* * This PTE is valid, not dirty, and read-only. Go take a * full fault (most likely to upgrade this part of the address * space to writeable). */ PMAP_UNLOCK(pmap); return (1); } if (!pte_test(pte, PTE_MANAGED)) panic("pmap_emulate_modified: unmanaged page"); /* * PTE is valid, managed, not dirty, and not read-only. Set PTE_D * and eagerly update the local TLB, returning on the fast path. */ pte_set(pte, PTE_D); tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { int mapped; /* * First, the direct-mapped regions. */ #if defined(__mips_n64) if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END) return (MIPS_XKPHYS_TO_PHYS(va)); #endif if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END) return (MIPS_KSEG0_TO_PHYS(va)); if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END) return (MIPS_KSEG1_TO_PHYS(va)); /* * User virtual addresses. */ if (va < VM_MAXUSER_ADDRESS) { pt_entry_t *ptep; if (curproc && curproc->p_vmspace) { ptep = pmap_pte(&curproc->p_vmspace->vm_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } return (0); } } /* * Should be kernel virtual here, otherwise fail */ mapped = (va >= MIPS_KSEG2_START || va < MIPS_KSEG2_END); #if defined(__mips_n64) mapped = mapped || (va >= MIPS_XKSEG_START || va < MIPS_XKSEG_END); #endif /* * Kernel virtual. */ if (mapped) { pt_entry_t *ptep; /* Is the kernel pmap initialized? */ if (!CPU_EMPTY(&kernel_pmap->pm_active)) { /* It's inside the virtual address range */ ptep = pmap_pte(kernel_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } } return (0); } panic("%s for unknown address space %p.", __func__, (void *)va); } void pmap_flush_pvcache(vm_page_t m) { pv_entry_t pv; if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * It appears that this function can only be called before any mappings * for the page are established. If this ever changes, this code will * need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); /* Clean memattr portion of pv_flags */ m->md.pv_flags &= ~PV_MEMATTR_MASK; m->md.pv_flags |= (ma << PV_MEMATTR_SHIFT) & PV_MEMATTR_MASK; } static __inline void pmap_pte_attr(pt_entry_t *pte, vm_memattr_t ma) { u_int npte; npte = *(u_int *)pte; npte &= ~PTE_C_MASK; npte |= PTE_C(ma); *pte = npte; } int pmap_change_attr(vm_offset_t sva, vm_size_t size, vm_memattr_t ma) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t ova, eva, va, va_next; pmap_t pmap; ova = sva; eva = sva + size; if (eva < sva) return (EINVAL); pmap = kernel_pmap; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V) || pte_cache_bits(pte) == ma) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; pmap_pte_attr(pte, ma); } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); /* Flush caches to be in the safe side */ mips_dcache_wbinv_range(ova, size); return 0; } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { switch (mode) { case VM_MEMATTR_UNCACHEABLE: case VM_MEMATTR_WRITE_BACK: #ifdef MIPS_CCA_WC case VM_MEMATTR_WRITE_COMBINING: #endif return (TRUE); default: return (FALSE); } } diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c index 5b57447edd6a..321804d634d5 100644 --- a/sys/mips/mips/uma_machdep.c +++ b/sys/mips/mips/uma_machdep.c @@ -1,94 +1,92 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; #ifndef __mips_n64 pflags &= ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; #endif for (;;) { m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT, pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) continue; #endif if (m != NULL) break; if ((wait & M_NOWAIT) != 0) return (NULL); vm_wait(NULL); } pa = VM_PAGE_TO_PHYS(m); if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)MIPS_PHYS_TO_DIRECT(pa); - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 55147d47b2bf..2a9318446c0f 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1,4329 +1,4326 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. * */ #define PV_LOCK_COUNT PA_LOCK_COUNT static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; /* * Cheap NUMA-izing of the pv locks, to reduce contention across domains. * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the * index at (N << 45). */ #ifdef __powerpc64__ #define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT) #else #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT) #endif #define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)])) #define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) #define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) #define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) #define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) /* Superpage PV lock */ #define PV_LOCK_SIZE (1<pvo_vaddr & PVO_LARGE) && \ (pvo)->pvo_pmap != kernel_pmap) /* Get physical address from PVO. */ #define PVO_PADDR(pvo) moea64_pvo_paddr(pvo) /* MD page flag indicating that the page is a superpage. */ #define MDPG_ATTR_SP 0x40000000 SYSCTL_DECL(_vm_pmap); static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0, "SP page mapping counters"); static u_long sp_demotions; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD, &sp_demotions, 0, "SP page demotions"); static u_long sp_mappings; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD, &sp_mappings, 0, "SP page mappings"); static u_long sp_p_failures; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD, &sp_p_failures, 0, "SP page promotion failures"); static u_long sp_p_fail_pa; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD, &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match"); static u_long sp_p_fail_flags; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD, &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match"); static u_long sp_p_fail_prot; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD, &sp_p_fail_prot, 0, "SP page promotion failure: page protections don't match"); static u_long sp_p_fail_wimg; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD, &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match"); static u_long sp_promotions; SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD, &sp_promotions, 0, "SP page promotions"); static bool moea64_ps_enabled(pmap_t); static void moea64_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t); static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree); static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m); static void moea64_sp_demote_aligned(struct pvo_entry *sp); static void moea64_sp_demote(struct pvo_entry *pvo); static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp); static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot); static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit); static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit); static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva); /* * Kernel MMU interface */ void moea64_clear_modify(vm_page_t); void moea64_copy_page(vm_page_t, vm_page_t); void moea64_copy_page_dmap(vm_page_t, vm_page_t); void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); void moea64_init(void); boolean_t moea64_is_modified(vm_page_t); boolean_t moea64_is_prefaultable(pmap_t, vm_offset_t); boolean_t moea64_is_referenced(vm_page_t); int moea64_ts_referenced(vm_page_t); vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(pmap_t, vm_page_t); void moea64_page_init(vm_page_t); int moea64_page_wired_mappings(vm_page_t); int moea64_pinit(pmap_t); void moea64_pinit0(pmap_t); void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(vm_offset_t, vm_page_t *, int); void moea64_qremove(vm_offset_t, int); void moea64_release(pmap_t); void moea64_remove(pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(pmap_t); void moea64_remove_all(vm_page_t); void moea64_remove_write(vm_page_t); void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(vm_page_t); void moea64_zero_page_dmap(vm_page_t); void moea64_zero_page_area(vm_page_t, int, int); void moea64_activate(struct thread *); void moea64_deactivate(struct thread *); void *moea64_mapdev(vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(vm_offset_t); void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(vm_paddr_t, vm_size_t); static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(void); vm_offset_t moea64_quick_enter_page(vm_page_t m); vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m); void moea64_quick_remove_page(vm_offset_t addr); boolean_t moea64_page_is_mapped(vm_page_t m); static int moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static size_t moea64_scan_pmap(void); static void *moea64_dump_pmap_init(unsigned blkpgs); #ifdef __powerpc64__ static void moea64_page_array_startup(long); #endif static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *); static struct pmap_funcs moea64_methods = { .clear_modify = moea64_clear_modify, .copy_page = moea64_copy_page, .copy_pages = moea64_copy_pages, .enter = moea64_enter, .enter_object = moea64_enter_object, .enter_quick = moea64_enter_quick, .extract = moea64_extract, .extract_and_hold = moea64_extract_and_hold, .init = moea64_init, .is_modified = moea64_is_modified, .is_prefaultable = moea64_is_prefaultable, .is_referenced = moea64_is_referenced, .ts_referenced = moea64_ts_referenced, .map = moea64_map, .mincore = moea64_mincore, .page_exists_quick = moea64_page_exists_quick, .page_init = moea64_page_init, .page_wired_mappings = moea64_page_wired_mappings, .pinit = moea64_pinit, .pinit0 = moea64_pinit0, .protect = moea64_protect, .qenter = moea64_qenter, .qremove = moea64_qremove, .release = moea64_release, .remove = moea64_remove, .remove_pages = moea64_remove_pages, .remove_all = moea64_remove_all, .remove_write = moea64_remove_write, .sync_icache = moea64_sync_icache, .unwire = moea64_unwire, .zero_page = moea64_zero_page, .zero_page_area = moea64_zero_page_area, .activate = moea64_activate, .deactivate = moea64_deactivate, .page_set_memattr = moea64_page_set_memattr, .quick_enter_page = moea64_quick_enter_page, .quick_remove_page = moea64_quick_remove_page, .page_is_mapped = moea64_page_is_mapped, #ifdef __powerpc64__ .page_array_startup = moea64_page_array_startup, #endif .ps_enabled = moea64_ps_enabled, .align_superpage = moea64_align_superpage, /* Internal interfaces */ .mapdev = moea64_mapdev, .mapdev_attr = moea64_mapdev_attr, .unmapdev = moea64_unmapdev, .kextract = moea64_kextract, .kenter = moea64_kenter, .kenter_attr = moea64_kenter_attr, .dev_direct_mapped = moea64_dev_direct_mapped, .dumpsys_pa_init = moea64_scan_init, .dumpsys_scan_pmap = moea64_scan_pmap, .dumpsys_dump_pmap_init = moea64_dump_pmap_init, .dumpsys_map_chunk = moea64_dumpsys_map, .map_user_ptr = moea64_map_user_ptr, .decode_kernel_ptr = moea64_decode_kernel_ptr, }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods); /* * Get physical address from PVO. * * For superpages, the lower bits are not stored on pvo_pte.pa and must be * obtained from VA. */ static __always_inline vm_paddr_t moea64_pvo_paddr(struct pvo_entry *pvo) { vm_paddr_t pa; pa = (pvo)->pvo_pte.pa & LPTE_RPGN; if (PVO_IS_SP(pvo)) { pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */ pa |= PVO_VADDR(pvo) & HPT_SP_MASK; } return (pa); } static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd." "Try setting machdep.moea64_bpvo_pool_size tunable", __func__, moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO); return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0) shift = moea64_large_page_shift; else shift = ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo); lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == PHYS_TO_DMAP(pa_base) && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: if (moea64_large_page_size == 0) { moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(aim.slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static int moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap) { struct pvo_entry *pvo; uint64_t pte_lo; int error; pte_lo = LPTE_M; pte_lo |= attr; pvo = alloc_pvo_entry(bootstrap); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; error = moea64_pvo_enter(pvo, NULL, NULL); if (error != 0) panic("Error %d inserting large page\n", error); return (0); } static void moea64_setup_direct_map(vm_offset_t kernelstart, vm_offset_t kernelend) { register_t msr; vm_paddr_t pa, pkernelstart, pkernelend; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1); } } PMAP_UNLOCK(kernel_pmap); } /* * Make sure the kernel and BPVO pool stay mapped on systems either * without a direct map or on which the kernel is not already executing * out of the direct-mapped region. */ if (kernelstart < DMAP_BASE_ADDRESS) { /* * For pre-dmap execution, we need to use identity mapping * because we will be operating with the mmu on but in the * wrong address configuration until we __restartkernel(). */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(pa, pa); } else if (!hw_direct_map) { pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS; pkernelend = kernelend & ~DMAP_BASE_ADDRESS; for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend; pa += PAGE_SIZE) moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); } if (!hw_direct_map) { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(pa, pa); /* Map exception vectors */ for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE) moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } void moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; /* Level 0 reservations consist of 4096 pages (16MB superpage). */ vm_level_0_order = 12; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else /* Install trap handlers for SLBs */ bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); __syncicache((void *)EXC_DSE, 0x80); __syncicache((void *)EXC_ISE, 0x80); #endif kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kernelphysstart && phys_avail[j+1] <= kernelphysend) { phys_avail[j] = phys_avail[j+1] = ~0; rm_pavail++; continue; } if (kernelphysstart >= phys_avail[j] && kernelphysstart < phys_avail[j+1]) { if (kernelphysend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; } if (kernelphysend >= phys_avail[j] && kernelphysend < phys_avail[j+1]) { if (kernelphysstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelphysstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; } } /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), pa_cmp); phys_avail_count -= rm_pavail; for (i = 2*phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i+1] = 0; } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); if (moea64_bpvo_pool_size == 0) { if (!hw_direct_map) moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) / (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR; else moea64_bpvo_pool_size = BPVO_POOL_SIZE; } if (boothowto & RB_VERBOSE) { printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n", moea64_bpvo_pool_size, moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576); } moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE); moea64_bpvo_pool_index = 0; /* Place at address usable through the direct map */ if (hw_direct_map) moea64_bpvo_pool = (struct pvo_entry *) PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_aim.slb[i].slbv = 0; pcpup->pc_aim.slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(kernelstart, kernelend); } void moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmu, sz); } /* * Calculate the last available physical address. */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Initialize MMU. */ pmap_cpu_bootstrap(0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, curcpu); crashdumpmap = (caddr_t)virtual_avail; virtual_avail += MAXDUMPPGS * PAGE_SIZE; /* * Allocate some things for page zeroing. We put this directly * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(moea64_scratchpage_va[i], 0); PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); PMAP_UNLOCK(kernel_pmap); } } numa_mem_regions(&numa_pregions, &numapregions_sz); } static void moea64_pmap_init_qpages(void) { struct pcpu *pc; int i; if (hw_direct_map) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); PMAP_LOCK(kernel_pmap); pc->pc_aim.qmap_pvo = moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); PMAP_UNLOCK(kernel_pmap); mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); /* * Activate a user pmap. This mostly involves setting some non-CPU * state. */ void moea64_activate(struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if (PVO_IS_SP(pvo)) { if (moea64_sp_pvo_in_range(pvo, sva, eva)) { pvo = moea64_sp_unwire(pvo); continue; } else { CTR1(KTR_PMAP, "%s: demote before unwire", __func__); moea64_sp_demote(pvo); } } if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = moea64_pte_replace(pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } static int moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { struct pvo_entry *pvo; vm_paddr_t pa; vm_page_t m; int val; bool managed; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, addr); if (pvo != NULL) { pa = PVO_PADDR(pvo); m = PHYS_TO_VM_PAGE(pa); managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED; if (PVO_IS_SP(pvo)) val = MINCORE_INCORE | MINCORE_PSIND(1); else val = MINCORE_INCORE; } else { PMAP_UNLOCK(pmap); return (0); } PMAP_UNLOCK(pmap); if (m == NULL) return (0); if (managed) { if (moea64_is_modified(m)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (moea64_is_referenced(m)) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } return (val); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(int which, vm_paddr_t pa) { struct pvo_entry *pvo; KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); pvo = moea64_scratchpage_pvo[which]; PMAP_LOCK(pvo->pvo_pmap); pvo->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); PMAP_UNLOCK(pvo->pvo_pmap); isync(); } void moea64_copy_page(vm_page_t msrc, vm_page_t mdst) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(msrc)); moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mdst)); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_page_dmap(vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), PAGE_SIZE); } inline void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_zero_page_area(vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, pa); va = moea64_scratchpage_va[0]; for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); mtx_unlock(&moea64_scratchpage_mtx); } void moea64_zero_page_dmap(vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; va = PHYS_TO_DMAP(pa); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); } vm_offset_t moea64_quick_enter_page(vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(aim.qmap_pvo); mtx_lock(PCPU_PTR(aim.qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void moea64_quick_remove_page(vm_offset_t addr) { mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(aim.qmap_lock)); sched_unpin(); } boolean_t moea64_page_is_mapped(vm_page_t m) { return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo, *tpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; vm_paddr_t pa; if ((m->oflags & VPO_UNMANAGED) == 0) { if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } if (psind > 0) return (moea64_sp_enter(pmap, va, m, prot, flags, psind)); pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pa = VM_PAGE_TO_PHYS(m); pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m)); pvo->pvo_pte.pa = pa | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } PV_LOCK(pa); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (moea64_ps_enabled(pmap) && (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL && PVO_IS_SP(tpvo)) { /* Demote SP before entering a regular page */ CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx", __func__, (uintmax_t)va); moea64_sp_demote_aligned(tpvo); } if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); error = moea64_pvo_enter(pvo, pvo_head, &oldpvo); if (error == EEXIST) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (moea64_pte_synch(oldpvo) < 0) { STAT_MOEA64(moea64_pte_overflow--); moea64_pte_insert(oldpvo); } /* Then just clean up and go home */ PMAP_UNLOCK(pmap); PV_UNLOCK(pa); free_pvo_entry(pvo); pvo = NULL; goto out; } else { /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(oldpvo); moea64_pvo_enter(pvo, pvo_head, NULL); } } PMAP_UNLOCK(pmap); PV_UNLOCK(pa); /* Free any dead pages */ if (error == EEXIST) { moea64_pvo_remove_from_page(oldpvo); free_pvo_entry(oldpvo); } out: /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pmap, va, pa, PAGE_SIZE); } #if VM_NRESERVLEVEL > 0 /* * Try to promote pages. * * If the VA of the entered page is not aligned with its PA, * don't try page promotion as it is not possible. * This reduces the number of promotion failures dramatically. */ if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL && (pvo->pvo_vaddr & PVO_MANAGED) != 0 && (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) moea64_sp_promote(pmap, va, m); #endif return (KERN_SUCCESS); } static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)(uintptr_t)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; vm_offset_t va; int8_t psind; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end && m->psind == 1 && moea64_ps_enabled(pm)) psind = 1; else psind = 0; moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind); if (psind == 1) m = &m[HPT_SP_SIZE / PAGE_SIZE - 1]; m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); } vm_paddr_t moea64_extract(pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static void * moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); - m = vm_page_alloc_domain(NULL, 0, domain, - malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); if (m == NULL) return (NULL); va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(pvo, NULL, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - bzero((void *)va, PAGE_SIZE); - return (void *)va; } extern int elf32_nxstack; void moea64_init() { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); /* * Are large page mappings enabled? * * While HPT superpages are not better tested, leave it disabled by * default. */ superpages_enabled = 0; TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("moea64_init: can't assign to pagesizes[1]")); if (moea64_large_page_size == 0) { printf("mmu_oea64: HW does not support large pages. " "Disabling superpages...\n"); superpages_enabled = 0; } else if (!moea64_has_lp_4k_16m) { printf("mmu_oea64: " "HW does not support mixed 4KB/16MB page sizes. " "Disabling superpages...\n"); superpages_enabled = 0; } else pagesizes[1] = HPT_SP_SIZE; } if (!hw_direct_map) { uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(m, LPTE_REF)); } boolean_t moea64_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (moea64_query_bit(m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; moea64_clear_bit(m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (PVO_IS_SP(pvo)) { CTR1(KTR_PMAP, "%s: demote before remwr", __func__); moea64_sp_demote(pvo); } pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { if (PVO_IS_SP(pvo)) { CTR1(KTR_PMAP, "%s: demote before set_memattr", __func__); moea64_sp_demote(pvo); } pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; do { pvo = alloc_pvo_entry(0); if (pvo == NULL) vm_wait(NULL); } while (pvo == NULL); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(pvo, NULL, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { moea64_pvo_remove_from_page(oldpvo); free_pvo_entry(oldpvo); } if (error != 0) panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, (uintmax_t)pa, error); } void moea64_kenter(vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 (or 62-bit aliased) mappings below * VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va & ~DMAP_BASE_ADDRESS); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(vm_offset_t va) { moea64_remove(kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ static int moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; #ifdef __powerpc64__ struct slb *slb; #endif register_t slbv; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); #ifdef __powerpc64__ /* Try lockless look-up first */ slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); if (slb == NULL) { /* If it isn't there, we need to pre-fault the VSID */ PMAP_LOCK(pm); slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; PMAP_UNLOCK(pm); } else { slbv = slb->slbv; } /* Mark segment no-execute */ slbv |= SLBV_N; #else slbv = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ slbv |= SR_N; #endif /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; #ifdef __powerpc64__ __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); #else __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); #endif return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (PHYS_TO_DMAP(pa_start)); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } void moea64_page_init(vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ int moea64_pinit(pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; return (1); } #else int moea64_pinit(pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); return (1); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); /* * If the PVO is in the page table, update mapping */ refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; if (pm != kernel_pmap && pg != NULL && (pg->a.flags & PGA_EXECUTABLE) == 0 && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(pm, PVO_VADDR(pvo), PVO_PADDR(pvo), PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if (PVO_IS_SP(pvo)) { if (moea64_sp_pvo_in_range(pvo, sva, eva)) { pvo = moea64_sp_protect(pvo, prot); continue; } else { CTR1(KTR_PMAP, "%s: demote before protect", __func__); moea64_sp_demote(pvo); } } moea64_pvo_protect(pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_dlist tofree; SLIST_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(pvo); free_pvo_entry(pvo); } } static void moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva, struct pvo_dlist *tofree) { struct pvo_entry *pvo, *tpvo, key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { if (PVO_IS_SP(pvo)) { if (moea64_sp_pvo_in_range(pvo, sva, eva)) { tpvo = moea64_sp_remove(pvo, tofree); continue; } else { CTR1(KTR_PMAP, "%s: demote before remove", __func__); moea64_sp_demote(pvo); } } tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(pvo); SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo; struct pvo_dlist tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; SLIST_INIT(&tofree); PMAP_LOCK(pm); moea64_remove_locked(pm, sva, eva, &tofree); PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) { if (PVO_IS_SP(pvo)) { CTR1(KTR_PMAP, "%s: demote before remove_all", __func__); moea64_sp_demote(pvo); } moea64_pvo_remove_from_pmap(pvo); } moea64_pvo_remove_from_page_locked(pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvop) { struct pvo_entry *old_pvo; int err; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); STAT_MOEA64(moea64_pvo_enter_calls++); /* * Add to pmap list */ old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); if (old_pvo != NULL) { if (oldpvop != NULL) *oldpvop = old_pvo; return (EEXIST); } if (pvo_head != NULL) { LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = moea64_pte_insert(pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } STAT_MOEA64(moea64_pvo_entries++); if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (0); } static void moea64_pvo_remove_from_pmap(struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = moea64_pte_unset(pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static inline void moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo, vm_page_t m) { KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(PVO_PADDR(pvo)); if (pvo->pvo_vaddr & PVO_MANAGED) { if (m != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(m))) vm_page_aflag_clear(m, PGA_WRITEABLE | PGA_EXECUTABLE); } } STAT_MOEA64(moea64_pvo_entries--); STAT_MOEA64(moea64_pvo_remove_calls++); } static void moea64_pvo_remove_from_page(struct pvo_entry *pvo) { vm_page_t pg = NULL; if (pvo->pvo_vaddr & PVO_MANAGED) pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); PV_LOCK(PVO_PADDR(pvo)); moea64_pvo_remove_from_page_locked(pvo, pg); PV_UNLOCK(PVO_PADDR(pvo)); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; vm_page_t sp; /* * See if this bit is stored in the page already. * * For superpages, the bit is stored in the first vm page. */ if ((m->md.mdpg_attrs & ptebit) != 0 || ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL && (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) == (ptebit | MDPG_ATTR_SP))) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (PVO_IS_SP(pvo)) { ret = moea64_sp_query(pvo, ptebit); /* * If SP was not demoted, check its REF/CHG bits here. */ if (ret != -1) { if ((ret & ptebit) != 0) { rv = TRUE; break; } continue; } /* else, fallthrough */ } ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = moea64_pte_synch(pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit. */ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (PVO_IS_SP(pvo)) { if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) { count += ret; continue; } } ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = moea64_pte_clear(pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; if (hw_direct_map && mem_valid(pa, size) == 0) return (0); PMAP_LOCK(kernel_pmap); ppa = pa & ~ADDR_POFF; key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || PVO_PADDR(pvo) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); moea64_qremove(base, atop(size)); kva_free(base, size); } void moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; if (__predict_false(pm == NULL)) pm = &curthread->td_proc->p_vmspace->vm_pmap; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va+1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = PVO_PADDR(pvo) | (va & ADDR_POFF); moea64_syncicache(pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) { *va = (void *)(uintptr_t)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init() { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } #ifdef __powerpc64__ static size_t moea64_scan_pmap() { struct pvo_entry *pvo; vm_paddr_t pa, pa_end; vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp; uint64_t lpsize; lpsize = moea64_large_page_size; kstart = trunc_page((vm_offset_t)_etext); kend = round_page((vm_offset_t)_end); kstart_lp = kstart & ~moea64_large_page_mask; kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask; CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, " "kstart_lp=0x%016lx, kend_lp=0x%016lx", kstart, kend, kstart_lp, kend_lp); PMAP_LOCK(kernel_pmap); RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) { va = pvo->pvo_vaddr; if (va & PVO_DEAD) continue; /* Skip DMAP (except kernel area) */ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) { if (va & PVO_LARGE) { pgva = va & ~moea64_large_page_mask; if (pgva < kstart_lp || pgva >= kend_lp) continue; } else { pgva = trunc_page(va); if (pgva < kstart || pgva >= kend) continue; } } pa = PVO_PADDR(pvo); if (va & PVO_LARGE) { pa_end = pa + lpsize; for (; pa < pa_end; pa += PAGE_SIZE) { if (vm_phys_is_dumpable(pa)) dump_add_page(pa); } } else { if (vm_phys_is_dumpable(pa)) dump_add_page(pa); } } PMAP_UNLOCK(kernel_pmap); return (sizeof(struct lpte) * moea64_pteg_count * 8); } static struct dump_context dump_ctx; static void * moea64_dump_pmap_init(unsigned blkpgs) { dump_ctx.ptex = 0; dump_ctx.ptex_end = moea64_pteg_count * 8; dump_ctx.blksz = blkpgs * PAGE_SIZE; return (&dump_ctx); } #else static size_t moea64_scan_pmap() { return (0); } static void * moea64_dump_pmap_init(unsigned blkpgs) { return (NULL); } #endif #ifdef __powerpc64__ static void moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages) { for (; npages > 0; --npages) { if (moea64_large_page_size != 0 && (pa & moea64_large_page_mask) == 0 && (va & moea64_large_page_mask) == 0 && npages >= (moea64_large_page_size >> PAGE_SHIFT)) { PMAP_LOCK(kernel_pmap); moea64_kenter_large(va, pa, 0, 0); PMAP_UNLOCK(kernel_pmap); pa += moea64_large_page_size; va += moea64_large_page_size; npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1; } else { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } } } static void moea64_page_array_startup(long pages) { long dom_pages[MAXMEMDOM]; vm_paddr_t pa; vm_offset_t va, vm_page_base; vm_size_t needed, size; long page; int domain; int i; vm_page_base = 0xd000000000000000ULL; /* Short-circuit single-domain systems. */ if (vm_ndomains == 1) { size = round_page(pages * sizeof(struct vm_page)); pa = vm_phys_early_alloc(0, size); vm_page_base = moea64_map(&vm_page_base, pa, pa + size, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = pages; vm_page_array = (vm_page_t)vm_page_base; return; } page = 0; for (i = 0; i < MAXMEMDOM; i++) dom_pages[i] = 0; /* Now get the number of pages required per domain. */ for (i = 0; i < vm_phys_nsegs; i++) { domain = vm_phys_segs[i].domain; KASSERT(domain < MAXMEMDOM, ("Invalid vm_phys_segs NUMA domain %d!\n", domain)); /* Get size of vm_page_array needed for this segment. */ size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start); dom_pages[domain] += size; } for (i = 0; phys_avail[i + 1] != 0; i+= 2) { domain = vm_phys_domain(phys_avail[i]); KASSERT(domain < MAXMEMDOM, ("Invalid phys_avail NUMA domain %d!\n", domain)); size = btoc(phys_avail[i + 1] - phys_avail[i]); dom_pages[domain] += size; } /* * Map in chunks that can get us all 16MB pages. There will be some * overlap between domains, but that's acceptable for now. */ vm_page_array_size = 0; va = vm_page_base; for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) { if (dom_pages[i] == 0) continue; size = ulmin(pages - vm_page_array_size, dom_pages[i]); size = round_page(size * sizeof(struct vm_page)); needed = size; size = roundup2(size, moea64_large_page_size); pa = vm_phys_early_alloc(i, size); vm_page_array_size += size / sizeof(struct vm_page); moea64_map_range(va, pa, size >> PAGE_SHIFT); /* Scoot up domain 0, to reduce the domain page overlap. */ if (i == 0) vm_page_base += size - needed; va += size; } vm_page_array = (vm_page_t)vm_page_base; vm_page_array_size = pages; } #endif static int64_t moea64_null_method(void) { return (0); } static int64_t moea64_pte_replace_default(struct pvo_entry *pvo, int flags) { int64_t refchg; refchg = moea64_pte_unset(pvo); moea64_pte_insert(pvo); return (refchg); } struct moea64_funcs *moea64_ops; #define DEFINE_OEA64_IFUNC(ret, func, args, def) \ DEFINE_IFUNC(, ret, moea64_##func, args) { \ moea64_##func##_t f; \ if (moea64_ops == NULL) \ return ((moea64_##func##_t)def); \ f = moea64_ops->func; \ return (f != NULL ? f : (moea64_##func##_t)def);\ } void moea64_install(void) { #ifdef __powerpc64__ if (hw_direct_map == -1) { moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; } #endif /* * Default to non-DMAP, and switch over to DMAP functions once we know * we have DMAP. */ if (hw_direct_map) { moea64_methods.quick_enter_page = moea64_quick_enter_page_dmap; moea64_methods.quick_remove_page = NULL; moea64_methods.copy_page = moea64_copy_page_dmap; moea64_methods.zero_page = moea64_zero_page_dmap; moea64_methods.copy_pages = moea64_copy_pages_dmap; } } DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int), moea64_pte_replace_default) DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method) /* Superpage functions */ /* MMU interface */ static bool moea64_ps_enabled(pmap_t pmap) { return (superpages_enabled); } static void moea64_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t sp_offset; if (size < HPT_SP_SIZE) return; CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx", __func__, (uintmax_t)offset, addr, (uintmax_t)size); if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); sp_offset = offset & HPT_SP_MASK; if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE || (*addr & HPT_SP_MASK) == sp_offset) return; if ((*addr & HPT_SP_MASK) < sp_offset) *addr = (*addr & ~HPT_SP_MASK) + sp_offset; else *addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset; } /* Helpers */ static __inline void moea64_pvo_cleanup(struct pvo_dlist *tofree) { struct pvo_entry *pvo; /* clean up */ while (!SLIST_EMPTY(tofree)) { pvo = SLIST_FIRST(tofree); SLIST_REMOVE_HEAD(tofree, pvo_dlink); if (pvo->pvo_vaddr & PVO_DEAD) moea64_pvo_remove_from_page(pvo); free_pvo_entry(pvo); } } static __inline uint16_t pvo_to_vmpage_flags(struct pvo_entry *pvo) { uint16_t flags; flags = 0; if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0) flags |= PGA_WRITEABLE; if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0) flags |= PGA_EXECUTABLE; return (flags); } /* * Check if the given pvo and its superpage are in sva-eva range. */ static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva) { vm_offset_t spva; spva = PVO_VADDR(pvo) & ~HPT_SP_MASK; if (spva >= sva && spva + HPT_SP_SIZE <= eva) { /* * Because this function is intended to be called from loops * that iterate over ordered pvo entries, if the condition * above is true then the pvo must be the first of its * superpage. */ KASSERT(PVO_VADDR(pvo) == spva, ("%s: unexpected unaligned superpage pvo", __func__)); return (true); } return (false); } /* * Update vm about the REF/CHG bits if the superpage is managed and * has (or had) write access. */ static void moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m, int64_t sp_refchg, vm_prot_t prot) { vm_page_t m_end; int64_t refchg; if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) { for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) { refchg = sp_refchg | atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } } } /* Superpage ops */ static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, **pvos; struct pvo_head *pvo_head; vm_offset_t sva; vm_page_t sm; vm_paddr_t pa, spa; bool sync; struct pvo_dlist tofree; int error, i; uint16_t aflags; KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned", __func__, (uintmax_t)va)); KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind)); KASSERT(m->psind == 1, ("%s: invalid m->psind: %d", __func__, m->psind)); KASSERT(pmap != kernel_pmap, ("%s: function called with kernel pmap", __func__)); CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1", __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m), prot, flags); SLIST_INIT(&tofree); sva = va; sm = m; spa = pa = VM_PAGE_TO_PHYS(sm); /* Try to allocate all PVOs first, to make failure handling easier. */ pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, M_NOWAIT); if (pvos == NULL) { CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__); return (KERN_RESOURCE_SHORTAGE); } for (i = 0; i < HPT_SP_PAGES; i++) { pvos[i] = alloc_pvo_entry(0); if (pvos[i] == NULL) { CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__); for (i = i - 1; i >= 0; i--) free_pvo_entry(pvos[i]); free(pvos, M_TEMP); return (KERN_RESOURCE_SHORTAGE); } } SP_PV_LOCK_ALIGNED(spa); PMAP_LOCK(pmap); /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */ moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree); /* Enter pages */ for (i = 0; i < HPT_SP_PAGES; i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) { pvo = pvos[i]; pvo->pvo_pte.prot = prot; pvo->pvo_pte.pa = (pa & ~HPT_SP_MASK) | LPTE_LP_4K_16M | moea64_calc_wimg(pa, pmap_page_get_memattr(m)); if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; pvo->pvo_vaddr |= PVO_LARGE; if ((m->oflags & VPO_UNMANAGED) != 0) pvo_head = NULL; else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } init_pvo_entry(pvo, pmap, va); error = moea64_pvo_enter(pvo, pvo_head, NULL); /* * All superpage PVOs were previously removed, so no errors * should occur while inserting the new ones. */ KASSERT(error == 0, ("%s: unexpected error " "when inserting superpage PVO: %d", __func__, error)); } PMAP_UNLOCK(pmap); SP_PV_UNLOCK_ALIGNED(spa); sync = (sm->a.flags & PGA_EXECUTABLE) == 0; /* Note: moea64_pvo_cleanup() also clears page prot. flags. */ moea64_pvo_cleanup(&tofree); pvo = pvos[0]; /* Set vm page flags */ aflags = pvo_to_vmpage_flags(pvo); if (aflags != 0) for (m = sm; m < &sm[HPT_SP_PAGES]; m++) vm_page_aflag_set(m, aflags); /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE); atomic_add_long(&sp_mappings, 1); CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p", __func__, (uintmax_t)sva, pmap); free(pvos, M_TEMP); return (KERN_SUCCESS); } static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m) { struct pvo_entry *first, *pvo; vm_paddr_t pa, pa_end; vm_offset_t sva, va_end; int64_t sp_refchg; /* This CTR may generate a lot of output. */ /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */ va &= ~HPT_SP_MASK; sva = va; /* Get superpage */ pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK; m = PHYS_TO_VM_PAGE(pa); PMAP_LOCK(pmap); /* * Check if all pages meet promotion criteria. * * XXX In some cases the loop below may be executed for each or most * of the entered pages of a superpage, which can be expensive * (although it was not profiled) and need some optimization. * * Some cases where this seems to happen are: * - When a superpage is first entered read-only and later becomes * read-write. * - When some of the superpage's virtual addresses map to previously * wired/cached pages while others map to pages allocated from a * different physical address range. A common scenario where this * happens is when mmap'ing a file that is already present in FS * block cache and doesn't fill a superpage. */ first = pvo = moea64_pvo_find_va(pmap, sva); for (pa_end = pa + HPT_SP_SIZE; pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) { if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) { CTR3(KTR_PMAP, "%s: NULL or dead PVO: pmap=%p, va=%#jx", __func__, pmap, (uintmax_t)va); goto error; } if (PVO_PADDR(pvo) != pa) { CTR5(KTR_PMAP, "%s: PAs don't match: " "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx", __func__, pmap, (uintmax_t)va, (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa); atomic_add_long(&sp_p_fail_pa, 1); goto error; } if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) != (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) { CTR5(KTR_PMAP, "%s: PVO flags don't match: " "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx", __func__, pmap, (uintmax_t)va, (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE), (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE)); atomic_add_long(&sp_p_fail_flags, 1); goto error; } if (first->pvo_pte.prot != pvo->pvo_pte.prot) { CTR5(KTR_PMAP, "%s: PVO protections don't match: " "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x", __func__, pmap, (uintmax_t)va, pvo->pvo_pte.prot, first->pvo_pte.prot); atomic_add_long(&sp_p_fail_prot, 1); goto error; } if ((first->pvo_pte.pa & LPTE_WIMG) != (pvo->pvo_pte.pa & LPTE_WIMG)) { CTR5(KTR_PMAP, "%s: WIMG bits don't match: " "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx", __func__, pmap, (uintmax_t)va, (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG), (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG)); atomic_add_long(&sp_p_fail_wimg, 1); goto error; } pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); } /* All OK, promote. */ /* * Handle superpage REF/CHG bits. If REF or CHG is set in * any page, then it must be set in the superpage. * * Instead of querying each page, we take advantage of two facts: * 1- If a page is being promoted, it was referenced. * 2- If promoted pages are writable, they were modified. */ sp_refchg = LPTE_REF | ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0); /* Promote pages */ for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE; pvo != NULL && PVO_VADDR(pvo) < va_end; pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { pvo->pvo_pte.pa &= ADDR_POFF | ~HPT_SP_MASK; pvo->pvo_pte.pa |= LPTE_LP_4K_16M; pvo->pvo_vaddr |= PVO_LARGE; } moea64_pte_replace_sp(first); /* Send REF/CHG bits to VM */ moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot); /* Use first page to cache REF/CHG bits */ atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP); PMAP_UNLOCK(pmap); atomic_add_long(&sp_mappings, 1); atomic_add_long(&sp_promotions, 1); CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", __func__, (uintmax_t)sva, pmap); return; error: atomic_add_long(&sp_p_failures, 1); PMAP_UNLOCK(pmap); } static void moea64_sp_demote_aligned(struct pvo_entry *sp) { struct pvo_entry *pvo; vm_offset_t va, va_end; vm_paddr_t pa; vm_page_t m; pmap_t pmap; int64_t refchg; CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); pmap = sp->pvo_pmap; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo = sp; /* Demote pages */ va = PVO_VADDR(pvo); pa = PVO_PADDR(pvo); m = PHYS_TO_VM_PAGE(pa); for (pvo = sp, va_end = va + HPT_SP_SIZE; pvo != NULL && PVO_VADDR(pvo) < va_end; pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo), va += PAGE_SIZE, pa += PAGE_SIZE) { KASSERT(pvo && PVO_VADDR(pvo) == va, ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); pvo->pvo_vaddr &= ~PVO_LARGE; pvo->pvo_pte.pa &= ~LPTE_RPGN; pvo->pvo_pte.pa |= pa; } refchg = moea64_pte_replace_sp(sp); /* * Clear SP flag * * XXX It is possible that another pmap has this page mapped as * part of a superpage, but as the SP flag is used only for * caching SP REF/CHG bits, that will be queried if not set * in cache, it should be ok to clear it here. */ atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP); /* * Handle superpage REF/CHG bits. A bit set in the superpage * means all pages should consider it set. */ moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot); atomic_add_long(&sp_demotions, 1); CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", __func__, (uintmax_t)PVO_VADDR(sp), pmap); } static void moea64_sp_demote(struct pvo_entry *pvo) { PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { pvo = moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK); KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx", __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); } moea64_sp_demote_aligned(pvo); } static struct pvo_entry * moea64_sp_unwire(struct pvo_entry *sp) { struct pvo_entry *pvo, *prev; vm_offset_t eva; pmap_t pm; int64_t ret, refchg; CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); pm = sp->pvo_pmap; PMAP_LOCK_ASSERT(pm, MA_OWNED); eva = PVO_VADDR(sp) + HPT_SP_SIZE; refchg = 0; for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("%s: pvo %p is missing PVO_WIRED", __func__, pvo); pvo->pvo_vaddr &= ~PVO_WIRED; ret = moea64_pte_replace(pvo, 0 /* No invalidation */); if (ret < 0) refchg |= LPTE_CHG; else refchg |= ret; pm->pm_stats.wired_count--; } /* Send REF/CHG bits to VM */ moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)), refchg, sp->pvo_pte.prot); return (prev); } static struct pvo_entry * moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot) { struct pvo_entry *pvo, *prev; vm_offset_t eva; pmap_t pm; vm_page_t m, m_end; int64_t ret, refchg; vm_prot_t oldprot; CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x", __func__, (uintmax_t)PVO_VADDR(sp), prot); pm = sp->pvo_pmap; PMAP_LOCK_ASSERT(pm, MA_OWNED); oldprot = sp->pvo_pte.prot; m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); KASSERT(m != NULL, ("%s: missing vm page for pa %#jx", __func__, (uintmax_t)PVO_PADDR(sp))); eva = PVO_VADDR(sp) + HPT_SP_SIZE; refchg = 0; for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { pvo->pvo_pte.prot = prot; /* * If the PVO is in the page table, update mapping */ ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) refchg |= LPTE_CHG; else refchg |= ret; } /* Send REF/CHG bits to VM */ moea64_sp_refchg_process(sp, m, refchg, oldprot); /* Handle pages that became executable */ if ((m->a.flags & PGA_EXECUTABLE) == 0 && (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), HPT_SP_SIZE); } return (prev); } static struct pvo_entry * moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree) { struct pvo_entry *pvo, *tpvo; vm_offset_t eva; pmap_t pm; CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); pm = sp->pvo_pmap; PMAP_LOCK_ASSERT(pm, MA_OWNED); eva = PVO_VADDR(sp) + HPT_SP_SIZE; for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(pvo); SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); } /* * Clear SP bit * * XXX See comment in moea64_sp_demote_aligned() for why it's * ok to always clear the SP bit on remove/demote. */ atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs, MDPG_ATTR_SP); return (tpvo); } static int64_t moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit) { int64_t refchg, ret; vm_offset_t eva; vm_page_t m; pmap_t pmap; struct pvo_entry *sp; pmap = pvo->pvo_pmap; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Get first SP PVO */ if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK); KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); } else sp = pvo; eva = PVO_VADDR(sp) + HPT_SP_SIZE; refchg = 0; for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { ret = moea64_pte_synch(pvo); if (ret > 0) { refchg |= ret & (LPTE_CHG | LPTE_REF); if ((refchg & ptebit) != 0) break; } } /* Save results */ if (refchg != 0) { m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP); } return (refchg); } static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit) { int64_t refchg; pmap_t pmap; pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); /* * Check if SP was demoted/removed before pmap lock was acquired. */ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", __func__, (uintmax_t)PVO_PADDR(pvo)); PMAP_UNLOCK(pmap); return (-1); } refchg = moea64_sp_query_locked(pvo, ptebit); PMAP_UNLOCK(pmap); CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", __func__, (uintmax_t)PVO_VADDR(pvo), (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg); return (refchg); } static int64_t moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit) { int64_t refchg, ret; pmap_t pmap; struct pvo_entry *sp; vm_offset_t eva; vm_page_t m; pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); /* * Check if SP was demoted/removed before pmap lock was acquired. */ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", __func__, (uintmax_t)PVO_PADDR(pvo)); PMAP_UNLOCK(pmap); return (-1); } /* Get first SP PVO */ if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK); KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); } else sp = pvo; eva = PVO_VADDR(sp) + HPT_SP_SIZE; refchg = 0; for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { ret = moea64_pte_clear(pvo, ptebit); if (ret > 0) refchg |= ret & (LPTE_CHG | LPTE_REF); } m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); atomic_clear_32(&m->md.mdpg_attrs, ptebit); PMAP_UNLOCK(pmap); CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", __func__, (uintmax_t)PVO_VADDR(sp), (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg); return (refchg); } static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit) { int64_t count, ret; pmap_t pmap; count = 0; pmap = pvo->pvo_pmap; /* * Since this reference bit is shared by 4096 4KB pages, it * should not be cleared every time it is tested. Apply a * simple "hash" function on the physical page number, the * virtual superpage number, and the pmap address to select * one 4KB page out of the 4096 on which testing the * reference bit will result in clearing that reference bit. * This function is designed to avoid the selection of the * same 4KB page for every 16MB page mapping. * * Always leave the reference bit of a wired mapping set, as * the current state of its reference bit won't affect page * replacement. */ if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^ (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) & (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1) return (-1); if ((ret & ptebit) != 0) count++; /* * If this page was not selected by the hash function, then assume * its REF bit was set. */ } else if (ptebit == LPTE_REF) { count++; /* * To clear the CHG bit of a single SP page, first it must be demoted. * But if no CHG bit is set, no bit clear and thus no SP demotion is * needed. */ } else { CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx", __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo), (uintmax_t)PVO_PADDR(pvo)); PMAP_LOCK(pmap); /* * Make sure SP wasn't demoted/removed before pmap lock * was acquired. */ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", __func__, (uintmax_t)PVO_PADDR(pvo)); PMAP_UNLOCK(pmap); return (-1); } ret = moea64_sp_query_locked(pvo, ptebit); if ((ret & ptebit) != 0) count++; else { PMAP_UNLOCK(pmap); return (0); } moea64_sp_demote(pvo); moea64_pte_clear(pvo, ptebit); /* * Write protect the mapping to a single page so that a * subsequent write access may repromote. */ if ((pvo->pvo_vaddr & PVO_WIRED) == 0) moea64_pvo_protect(pmap, pvo, pvo->pvo_pte.prot & ~VM_PROT_WRITE); PMAP_UNLOCK(pmap); } return (count); } diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c index ca1a4d2f4797..d29ca730d7d6 100644 --- a/sys/powerpc/aim/mmu_radix.c +++ b/sys/powerpc/aim/mmu_radix.c @@ -1,6470 +1,6462 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018 Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* For pseries bit. */ #include #ifdef INVARIANTS #include #endif #define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) #include "opt_ddb.h" #ifdef DDB static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); #endif #define PG_W RPTE_WIRED #define PG_V RPTE_VALID #define PG_MANAGED RPTE_MANAGED #define PG_PROMOTED RPTE_PROMOTED #define PG_M RPTE_C #define PG_A RPTE_R #define PG_X RPTE_EAA_X #define PG_RW RPTE_EAA_W #define PG_PTE_CACHE RPTE_ATTR_MASK #define RPTE_SHIFT 9 #define NLS_MASK ((1UL<<5)-1) #define RPTE_ENTRIES (1UL<> L3_PAGE_SIZE_SHIFT); } static __inline vm_pindex_t pmap_pml3e_index(vm_offset_t va) { return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); } static __inline vm_pindex_t pmap_pml2e_index(vm_offset_t va) { return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); } static __inline vm_pindex_t pmap_pml1e_index(vm_offset_t va) { return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & RPTE_MASK); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) { pt_entry_t *pte; vm_paddr_t ptepa; ptepa = (be64toh(*l3e) & NLB_MASK); pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) { pt_entry_t *l3e; vm_paddr_t l3pa; l3pa = (be64toh(*l2e) & NLB_MASK); l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); return (&l3e[pmap_pml3e_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) { pt_entry_t *l2e; vm_paddr_t l2pa; l2pa = (be64toh(*l1e) & NLB_MASK); l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); return (&l2e[pmap_pml2e_index(va)]); } static __inline pml1_entry_t * pmap_pml1e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml1[pmap_pml1e_index(va)]); } static pt_entry_t * pmap_pml2e(pmap_t pmap, vm_offset_t va) { pt_entry_t *l1e; l1e = pmap_pml1e(pmap, va); if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0) return (NULL); return (pmap_l1e_to_l2e(l1e, va)); } static __inline pt_entry_t * pmap_pml3e(pmap_t pmap, vm_offset_t va) { pt_entry_t *l2e; l2e = pmap_pml2e(pmap, va); if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0) return (NULL); return (pmap_l2e_to_l3e(l2e, va)); } static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pt_entry_t *l3e; l3e = pmap_pml3e(pmap, va); if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0) return (NULL); return (pmap_l3e_to_pte(l3e, va)); } int nkpt = 64; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); vm_paddr_t dmaplimit; SYSCTL_DECL(_vm_pmap); #ifdef INVARIANTS #define VERBOSE_PMAP 0 #define VERBOSE_PROTECT 0 static int pmap_logging; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, &pmap_logging, 0, "verbose debug logging"); #endif static u_int64_t KPTphys; /* phys addr of kernel level 1 */ //static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ static vm_offset_t qframe = 0; static struct mtx qframe_mtx; void mmu_radix_activate(struct thread *); void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int); void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t); void mmu_radix_clear_modify(vm_page_t); void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *); int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va); vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); void mmu_radix_kenter(vm_offset_t, vm_paddr_t); vm_paddr_t mmu_radix_kextract(vm_offset_t); void mmu_radix_kremove(vm_offset_t); boolean_t mmu_radix_is_modified(vm_page_t); boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t); boolean_t mmu_radix_is_referenced(vm_page_t); void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t, vm_pindex_t, vm_size_t); boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t); void mmu_radix_page_init(vm_page_t); boolean_t mmu_radix_page_is_mapped(vm_page_t m); void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t); int mmu_radix_page_wired_mappings(vm_page_t); int mmu_radix_pinit(pmap_t); void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); bool mmu_radix_ps_enabled(pmap_t); void mmu_radix_qenter(vm_offset_t, vm_page_t *, int); void mmu_radix_qremove(vm_offset_t, int); vm_offset_t mmu_radix_quick_enter_page(vm_page_t); void mmu_radix_quick_remove_page(vm_offset_t); boolean_t mmu_radix_ts_referenced(vm_page_t); void mmu_radix_release(pmap_t); void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t); void mmu_radix_remove_all(vm_page_t); void mmu_radix_remove_pages(pmap_t); void mmu_radix_remove_write(vm_page_t); void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t); void mmu_radix_zero_page(vm_page_t); void mmu_radix_zero_page_area(vm_page_t, int, int); int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); void mmu_radix_page_array_startup(long pages); #include "mmu_oea64.h" /* * Kernel MMU interface */ static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t); static void mmu_radix_copy_page(vm_page_t, vm_page_t); static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); static void mmu_radix_growkernel(vm_offset_t); static void mmu_radix_init(void); static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *); static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); static void mmu_radix_pinit0(pmap_t); static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t); static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); static void mmu_radix_unmapdev(vm_offset_t, vm_size_t); static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t); static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); static void mmu_radix_scan_init(void); static void mmu_radix_cpu_bootstrap(int ap); static void mmu_radix_tlbie_all(void); static struct pmap_funcs mmu_radix_methods = { .bootstrap = mmu_radix_bootstrap, .copy_page = mmu_radix_copy_page, .copy_pages = mmu_radix_copy_pages, .cpu_bootstrap = mmu_radix_cpu_bootstrap, .growkernel = mmu_radix_growkernel, .init = mmu_radix_init, .map = mmu_radix_map, .mincore = mmu_radix_mincore, .pinit = mmu_radix_pinit, .pinit0 = mmu_radix_pinit0, .mapdev = mmu_radix_mapdev, .mapdev_attr = mmu_radix_mapdev_attr, .unmapdev = mmu_radix_unmapdev, .kenter_attr = mmu_radix_kenter_attr, .dev_direct_mapped = mmu_radix_dev_direct_mapped, .dumpsys_pa_init = mmu_radix_scan_init, .dumpsys_map_chunk = mmu_radix_dumpsys_map, .page_is_mapped = mmu_radix_page_is_mapped, .ps_enabled = mmu_radix_ps_enabled, .align_superpage = mmu_radix_align_superpage, .object_init_pt = mmu_radix_object_init_pt, .protect = mmu_radix_protect, /* pmap dispatcher interface */ .clear_modify = mmu_radix_clear_modify, .copy = mmu_radix_copy, .enter = mmu_radix_enter, .enter_object = mmu_radix_enter_object, .enter_quick = mmu_radix_enter_quick, .extract = mmu_radix_extract, .extract_and_hold = mmu_radix_extract_and_hold, .is_modified = mmu_radix_is_modified, .is_prefaultable = mmu_radix_is_prefaultable, .is_referenced = mmu_radix_is_referenced, .ts_referenced = mmu_radix_ts_referenced, .page_exists_quick = mmu_radix_page_exists_quick, .page_init = mmu_radix_page_init, .page_wired_mappings = mmu_radix_page_wired_mappings, .qenter = mmu_radix_qenter, .qremove = mmu_radix_qremove, .release = mmu_radix_release, .remove = mmu_radix_remove, .remove_all = mmu_radix_remove_all, .remove_write = mmu_radix_remove_write, .unwire = mmu_radix_unwire, .zero_page = mmu_radix_zero_page, .zero_page_area = mmu_radix_zero_page_area, .activate = mmu_radix_activate, .quick_enter_page = mmu_radix_quick_enter_page, .quick_remove_page = mmu_radix_quick_remove_page, .page_set_memattr = mmu_radix_page_set_memattr, .page_array_startup = mmu_radix_page_array_startup, /* Internal interfaces */ .kenter = mmu_radix_kenter, .kextract = mmu_radix_kextract, .kremove = mmu_radix_kremove, .change_attr = mmu_radix_change_attr, .decode_kernel_ptr = mmu_radix_decode_kernel_ptr, .tlbie_all = mmu_radix_tlbie_all, }; MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods); static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void free_pv_chunk(struct pv_chunk *pc); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); static void pmap_invalidate_all(pmap_t pmap); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ #define UNIMPLEMENTED() panic("%s not implemented", __func__) #define UNTESTED() panic("%s not yet tested", __func__) /* Number of supported PID bits */ static unsigned int isa3_pid_bits; /* PID to start allocating from */ static unsigned int isa3_base_pid; #define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) #define PROCTAB_ENTRIES (1ul << isa3_pid_bits) /* * Map of physical memory regions. */ static struct mem_region *regions, *pregions; static struct numa_mem_region *numa_pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz, numa_pregions_sz; static struct pate *isa3_parttab; static struct prte *isa3_proctab; static vmem_t *asid_arena; extern void bs_remap_earlyboot(void); #define RADIX_PGD_SIZE_SHIFT 16 #define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) #define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) #define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ #define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ #define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ /* POWER9 only permits a 64k partition table size. */ #define PARTTAB_SIZE_SHIFT 16 #define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) #define PARTTAB_HR (1UL << 63) /* host uses radix */ #define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ /* TLB flush actions. Used as argument to tlbiel_flush() */ enum { TLB_INVAL_SCOPE_LPID = 2, /* invalidate TLBs for current LPID */ TLB_INVAL_SCOPE_GLOBAL = 3, /* invalidate all TLBs */ }; #define NPV_LIST_LOCKS MAXCPU static int pmap_initialized; static vm_paddr_t proctab0pa; static vm_paddr_t parttab_phys; CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * We support 52 bits, hence: * bits 52 - 31 = 21, 0b10101 * RTS encoding details * bits 0 - 3 of rts -> bits 6 - 8 unsigned long * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long */ #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) static int powernv_enabled = 1; static __always_inline void tlbiel_radix_set_isa300(uint32_t set, uint32_t is, uint32_t pid, uint32_t ric, uint32_t prs) { uint64_t rb; uint64_t rs; rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory"); } static void tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) { uint32_t set; __asm __volatile("ptesync": : :"memory"); /* * Flush the first set of the TLB, and the entire Page Walk Cache * and partition table entries. Then flush the remaining sets of the * TLB. */ if (is == TLB_INVAL_SCOPE_GLOBAL) { tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); } /* Do the same for process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); __asm __volatile("ptesync": : :"memory"); } static void mmu_radix_tlbiel_flush(int scope) { MPASS(scope == TLB_INVAL_SCOPE_LPID || scope == TLB_INVAL_SCOPE_GLOBAL); tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope); __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } static void mmu_radix_tlbie_all() { if (powernv_enabled) mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); else mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); } static void mmu_radix_init_amor(void) { /* * In HV mode, we init AMOR (Authority Mask Override Register) so that * the hypervisor and guest can setup IAMR (Instruction Authority Mask * Register), enable key 0 and set it to 1. * * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) */ mtspr(SPR_AMOR, (3ul << 62)); } static void mmu_radix_init_iamr(void) { /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction * fetch. */ mtspr(SPR_IAMR, (1ul << 62)); } static void mmu_radix_pid_set(pmap_t pmap) { mtspr(SPR_PID, pmap->pm_pid); isync(); } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } #define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) #define pte_load_clear(ptep) atomic_swap_long(ptep, 0) #define pte_store(ptep, pte) do { \ MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \ } while (0) /* * NB: should only be used for adding directories - not for direct mappings */ #define pde_store(ptep, pa) do { \ *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \ } while (0) #define pte_clear(ptep) do { \ *(u_long *)(ptep) = (u_long)(0); \ } while (0) #define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ /* * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB * (PTE) page mappings have identical settings for the following fields: */ #define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ PG_M | PG_A | RPTE_EAA_MASK | PG_V) static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pagezero(vm_offset_t va) { va = trunc_page(va); bzero((void *)va, PAGE_SIZE); } static uint64_t allocpages(int n) { u_int64_t ret; ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); for (int i = 0; i < n; i++) pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); return (ret); } static pt_entry_t * kvtopte(vm_offset_t va) { pt_entry_t *l3e; l3e = pmap_pml3e(kernel_pmap, va); if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0) return (NULL); return (pmap_l3e_to_pte(l3e, va)); } void mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = kvtopte(va); MPASS(pte != NULL); *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \ RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A); } bool mmu_radix_ps_enabled(pmap_t pmap) { return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static pt_entry_t * pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) { pml3_entry_t *l3e; pt_entry_t *pte; va &= PG_PS_FRAME; l3e = pmap_pml3e(pmap, va); if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0) return (NULL); if (be64toh(*l3e) & RPTE_LEAF) { *is_l3e = 1; return (l3e); } *is_l3e = 0; va &= PG_FRAME; pte = pmap_l3e_to_pte(l3e, va); if (pte == NULL || (be64toh(*pte) & PG_V) == 0) return (NULL); return (pte); } int pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) { pt_entry_t *pte; pt_entry_t startpte, origpte, newpte; vm_page_t m; int is_l3e; startpte = 0; retry: if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) return (KERN_INVALID_ADDRESS); origpte = newpte = be64toh(*pte); if (startpte == 0) { startpte = origpte; if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || ((flags & VM_PROT_READ) && (startpte & PG_A))) { pmap_invalidate_all(pmap); #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", __func__, pmap, va, flags, origpte); #endif return (KERN_FAILURE); } } #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, flags, origpte); #endif PMAP_LOCK(pmap); if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || be64toh(*pte) != origpte) { PMAP_UNLOCK(pmap); return (KERN_FAILURE); } m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); MPASS(m != NULL); switch (flags) { case VM_PROT_READ: if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) goto protfail; newpte |= PG_A; vm_page_aflag_set(m, PGA_REFERENCED); break; case VM_PROT_WRITE: if ((newpte & RPTE_EAA_W) == 0) goto protfail; if (is_l3e) goto protfail; newpte |= PG_M; vm_page_dirty(m); break; case VM_PROT_EXECUTE: if ((newpte & RPTE_EAA_X) == 0) goto protfail; newpte |= PG_A; vm_page_aflag_set(m, PGA_REFERENCED); break; } if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) goto retry; ptesync(); PMAP_UNLOCK(pmap); if (startpte == newpte) return (KERN_FAILURE); return (0); protfail: PMAP_UNLOCK(pmap); return (KERN_PROTECTION_FAILURE); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t mmu_radix_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(vm_memattr_t ma) { if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (RPTE_ATTR_GUARDEDIO); case VM_MEMATTR_CACHEABLE: return (RPTE_ATTR_MEM); case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: case VM_MEMATTR_WRITE_COMBINING: return (RPTE_ATTR_UNGUARDEDIO); } } return (0); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpg_kernel_4k(start); else radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); ttusync(); } static void pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpg_kernel_2m(start); else radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); ttusync(); } static void pmap_invalidate_pwc(pmap_t pmap) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpwc_kernel(); else radix_tlbie_invlpwc_user(pmap->pm_pid); ttusync(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) { if (((start - end) >> PAGE_SHIFT) > 8) { pmap_invalidate_all(pmap); return; } ptesync(); if (pmap == kernel_pmap) { while (start < end) { radix_tlbie_invlpg_kernel_4k(start); start += PAGE_SIZE; } } else { while (start < end) { radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); start += PAGE_SIZE; } } ttusync(); } static void pmap_invalidate_all(pmap_t pmap) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_flush_kernel(); else radix_tlbie_flush_user(pmap->pm_pid); ttusync(); } static void pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ ptesync(); if ((l3e & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); else pmap_invalidate_page_2m(pmap, va); pmap_invalidate_pwc(pmap); } static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0x3ffffffffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { // if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); #if 0 free = popcnt_pc_map_pq(pc->pc_map); #endif if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { #ifdef INVARIANTS if (PV_PMAP(pv) == NULL) { printf("corrupted pv_chunk/pv %p\n", pv); printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); } MPASS(PV_PMAP(pv) != NULL); MPASS(pv->pv_va != 0); #endif if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & L3_PAGE_MASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + L3_PAGE_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 , ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = cnttzd(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static int active_reclaims = 0; static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pml3_entry_t *l3e; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = cnttzd(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; l3e = pmap_pml3e(pmap, va); if ((be64toh(*l3e) & RPTE_LEAF) != 0) continue; pte = pmap_l3e_to_pte(l3e, va); if ((be64toh(*pte) & PG_W) != 0) continue; tpte = be64toh(pte_load_clear(pte)); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, be64toh(*l3e), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; #ifdef VERBOSE_PV if (pmap != kernel_pmap) printf("%s(%p, %p)\n", __func__, pmap, pv); #endif PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = cnttzd(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); MPASS(PV_PMAP(pv) != NULL); return (pv); } } /* No free items, allocate another chunk */ - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); MPASS(PV_PMAP(pv) != NULL); return (pv); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L3_PAGE_MASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L3_PAGE_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; return (TRUE); } else return (FALSE); } vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; #ifdef INVARIANTS static void validate_addr(vm_paddr_t addr, vm_size_t size) { vm_paddr_t end = addr + size; bool found = false; for (int i = 0; i < 2 * phys_avail_count; i += 2) { if (addr >= phys_avail_debug[i] && end <= phys_avail_debug[i + 1]) { found = true; break; } } KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", addr, end)); } #else static void validate_addr(vm_paddr_t addr, vm_size_t size) {} #endif #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) static vm_paddr_t alloc_pt_page(void) { vm_paddr_t page; page = allocpages(1); pagezero(PHYS_TO_DMAP(page)); return (page); } static void mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) { pt_entry_t *pte, pteval; vm_paddr_t page; if (bootverbose) printf("%s %lx -> %lx\n", __func__, start, end); while (start < end) { pteval = start | DMAP_PAGE_BITS; pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); if ((be64toh(*pte) & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); if ((start & L2_PAGE_MASK) == 0 && end - start >= L2_PAGE_SIZE) { start += L2_PAGE_SIZE; goto done; } else if ((be64toh(*pte) & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); if ((start & L3_PAGE_MASK) == 0 && end - start >= L3_PAGE_SIZE) { start += L3_PAGE_SIZE; goto done; } else if ((be64toh(*pte) & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); start += PAGE_SIZE; done: pte_store(pte, pteval); } } static void mmu_radix_dmap_populate(vm_size_t hwphyssz) { vm_paddr_t start, end; for (int i = 0; i < pregions_sz; i++) { start = pregions[i].mr_start; end = start + pregions[i].mr_size; if (hwphyssz && start >= hwphyssz) break; if (hwphyssz && hwphyssz < end) end = hwphyssz; mmu_radix_dmap_range(start, end); } } static void mmu_radix_setup_pagetables(vm_size_t hwphyssz) { vm_paddr_t ptpages, pages; pt_entry_t *pte; vm_paddr_t l1phys; bzero(kernel_pmap, sizeof(struct pmap)); PMAP_LOCK_INIT(kernel_pmap); ptpages = allocpages(3); l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); validate_addr(l1phys, RADIX_PGD_SIZE); if (bootverbose) printf("l1phys=%lx\n", l1phys); MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); mmu_radix_dmap_populate(hwphyssz); /* * Create page tables for first 128MB of KVA */ pages = ptpages; pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); pages += PAGE_SIZE; pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); pages += PAGE_SIZE; pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); /* * the kernel page table pages need to be preserved in * phys_avail and not overlap with previous allocations */ pages = allocpages(nkpt); if (bootverbose) { printf("phys_avail after dmap populate and nkpt allocation\n"); for (int j = 0; j < 2 * phys_avail_count; j+=2) printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", j, phys_avail[j], j + 1, phys_avail[j + 1]); } KPTphys = pages; for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; if (bootverbose) printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); } static void mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) { vm_paddr_t kpstart, kpend; vm_size_t physsz, hwphyssz; //uint64_t l2virt; int rm_pavail, proctab_size; int i, j; kpstart = start & ~DMAP_BASE_ADDRESS; kpend = end & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); if (2 * VM_PHYSSEG_MAX < regions_sz) panic("mmu_radix_early_bootstrap: phys_avail too small"); if (bootverbose) for (int i = 0; i < regions_sz; i++) printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", i, regions[i].mr_start, i, regions[i].mr_size); /* * XXX workaround a simulator bug */ for (int i = 0; i < regions_sz; i++) if (regions[i].mr_start & PAGE_MASK) { regions[i].mr_start += PAGE_MASK; regions[i].mr_start &= ~PAGE_MASK; regions[i].mr_size &= ~PAGE_MASK; } if (bootverbose) for (int i = 0; i < pregions_sz; i++) printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", i, pregions[i].mr_start, i, pregions[i].mr_size); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++) { if (bootverbose) printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", i, regions[i].mr_start, i, regions[i].mr_size); if (regions[i].mr_size < PAGE_SIZE) continue; if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + (hwphyssz - physsz); physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; phys_avail_count++; physsz += regions[i].mr_size; j += 2; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2 * phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kpstart && phys_avail[j + 1] <= kpend) { phys_avail[j] = phys_avail[j + 1] = ~0; rm_pavail++; continue; } if (kpstart >= phys_avail[j] && kpstart < phys_avail[j + 1]) { if (kpend < phys_avail[j + 1]) { phys_avail[2 * phys_avail_count] = (kpend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2 * phys_avail_count + 1] = phys_avail[j + 1]; phys_avail_count++; } phys_avail[j + 1] = kpstart & ~PAGE_MASK; } if (kpend >= phys_avail[j] && kpend < phys_avail[j + 1]) { if (kpstart > phys_avail[j]) { phys_avail[2 * phys_avail_count] = phys_avail[j]; phys_avail[2 * phys_avail_count + 1] = kpstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kpend & ~PAGE_MASK) + PAGE_SIZE; } } qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); for (i = 0; i < 2 * phys_avail_count; i++) phys_avail_debug[i] = phys_avail[i]; /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { phys_avail_count -= rm_pavail; for (i = 2 * phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i + 1] = 0; } if (bootverbose) { printf("phys_avail ranges after filtering:\n"); for (j = 0; j < 2 * phys_avail_count; j+=2) printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", j, phys_avail[j], j + 1, phys_avail[j + 1]); } physmem = btoc(physsz); /* XXX assume we're running non-virtualized and * we don't support BHYVE */ if (isa3_pid_bits == 0) isa3_pid_bits = 20; if (powernv_enabled) { parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); validate_addr(parttab_phys, PARTTAB_SIZE); for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); } proctab_size = 1UL << PROCTAB_SIZE_SHIFT; proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); validate_addr(proctab0pa, proctab_size); for (int i = 0; i < proctab_size/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); mmu_radix_setup_pagetables(hwphyssz); } static void mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) { int i; vm_paddr_t pa; void *dpcpu; vm_offset_t va; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ if (bootverbose) printf("%s enter\n", __func__); /* * Calculate the last available physical address, and reserve the * vm_page_array (upper bound). */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = allocpages(kstack_pages); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; for (i = 0; i < kstack_pages; i++) { mmu_radix_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } thread0.td_kstack_pages = kstack_pages; /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); /* * Allocate virtual address space for the dynamic percpu area. */ pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); dpcpu = (void *)PHYS_TO_DMAP(pa); dpcpu_init(dpcpu, curcpu); crashdumpmap = (caddr_t)virtual_avail; virtual_avail += MAXDUMPPGS * PAGE_SIZE; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ } static void mmu_parttab_init(void) { uint64_t ptcr; isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); if (bootverbose) printf("%s parttab: %p\n", __func__, isa3_parttab); ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); if (bootverbose) printf("setting ptcr %lx\n", ptcr); mtspr(SPR_PTCR, ptcr); } static void mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) { uint64_t prev; if (bootverbose) printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, lpid, pagetab, proctab); prev = be64toh(isa3_parttab[lpid].pagetab); isa3_parttab[lpid].pagetab = htobe64(pagetab); isa3_parttab[lpid].proctab = htobe64(proctab); if (prev & PARTTAB_HR) { __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); } else { __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); } ttusync(); } static void mmu_radix_parttab_init(void) { uint64_t pagetab; mmu_parttab_init(); pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; mmu_parttab_update(0, pagetab, 0); } static void mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) { uint64_t pagetab, proctab; pagetab = be64toh(isa3_parttab[0].pagetab); proctab = proctabpa | table_size | PARTTAB_GR; mmu_parttab_update(0, pagetab, proctab); } static void mmu_radix_proctab_init(void) { isa3_base_pid = 1; isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); isa3_proctab->proctab0 = htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | RADIX_PGD_INDEX_SHIFT); if (powernv_enabled) { mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); __asm __volatile("ptesync" : : : "memory"); __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); #ifdef PSERIES } else { int64_t rc; rc = phyp_hcall(H_REGISTER_PROC_TBL, PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE, proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12); if (rc != H_SUCCESS) panic("mmu_radix_proctab_init: " "failed to register process table: rc=%jd", (intmax_t)rc); #endif } if (bootverbose) printf("process table %p and kernel radix PDE: %p\n", isa3_proctab, kernel_pmap->pm_pml1); mtmsr(mfmsr() | PSL_DR ); mtmsr(mfmsr() & ~PSL_DR); kernel_pmap->pm_pid = isa3_base_pid; isa3_base_pid++; } void mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t oldl3e, *l3e; pt_entry_t *pte; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; anychanged = false; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((be64toh(*l1e) & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((be64toh(*l2e) & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); oldl3e = be64toh(*l3e); if ((oldl3e & PG_V) == 0) continue; else if ((oldl3e & RPTE_LEAF) != 0) { if ((oldl3e & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl3e & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("mmu_radix_advise: no address gap")); pte = pmap_l3e_to_pte(l3e, va); KASSERT((be64toh(*pte) & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, htobe64(PG_M | PG_A)); } else if ((be64toh(*pte) & PG_A) != 0) atomic_clear_long(pte, htobe64(PG_A)); else goto maybe_invlrng; anychanged = true; continue; maybe_invlrng: if (va != va_next) { anychanged = true; va = va_next; } } if (va != va_next) anychanged = true; } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Routines used in machine-dependent code */ static void mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) { uint64_t lpcr; if (bootverbose) printf("%s\n", __func__); hw_direct_map = 1; powernv_enabled = (mfmsr() & PSL_HV) ? 1 : 0; mmu_radix_early_bootstrap(start, end); if (bootverbose) printf("early bootstrap complete\n"); if (powernv_enabled) { lpcr = mfspr(SPR_LPCR); mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); mmu_radix_parttab_init(); mmu_radix_init_amor(); if (bootverbose) printf("powernv init complete\n"); } mmu_radix_init_iamr(); mmu_radix_proctab_init(); mmu_radix_pid_set(kernel_pmap); if (powernv_enabled) mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); else mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); mmu_radix_late_bootstrap(start, end); numa_mem_regions(&numa_pregions, &numa_pregions_sz); if (bootverbose) printf("%s done\n", __func__); pmap_bootstrapped = 1; dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); } static void mmu_radix_cpu_bootstrap(int ap) { uint64_t lpcr; uint64_t ptcr; if (powernv_enabled) { lpcr = mfspr(SPR_LPCR); mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); mtspr(SPR_PTCR, ptcr); mmu_radix_init_amor(); } mmu_radix_init_iamr(); mmu_radix_pid_set(kernel_pmap); if (powernv_enabled) mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); else mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, "2MB page mapping counters"); static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions); SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l3e_demotions, "2MB page demotions"); static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings); SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l3e_mappings, "2MB page mappings"); static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures); SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l3e_p_failures, "2MB page promotion failures"); static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions); SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l3e_promotions, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, "1GB page mapping counters"); static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions); SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2e_demotions, "1GB page demotions"); void mmu_radix_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pml3_entry_t oldl3e, *l3e; pt_entry_t oldpte, *pte; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); CTR2(KTR_PMAP, "%s(%p)", __func__, m); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->a.flags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l3e = pmap_pml3e(pmap, va); oldl3e = be64toh(*l3e); if ((oldl3e & PG_RW) != 0 && pmap_demote_l3e_locked(pmap, l3e, va, &lock) && (oldl3e & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl3e & PG_PS_FRAME); pte = pmap_l3e_to_pte(l3e, va); oldpte = be64toh(*pte); while (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))) oldpte = be64toh(*pte); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, htobe64(PG_M)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; bool invalidate_all; CTR6(KTR_PMAP, "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); if (dst_addr != src_addr) return; lock = NULL; invalidate_all = false; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t srcptepaddr, *l3e; pt_entry_t *src_pte, *dst_pte; l1e = pmap_pml1e(src_pmap, addr); if ((be64toh(*l1e) & PG_V) == 0) { va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < addr) va_next = end_addr; continue; } l2e = pmap_l1e_to_l2e(l1e, addr); if ((be64toh(*l2e) & PG_V) == 0) { va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < addr) va_next = end_addr; l3e = pmap_l2e_to_l3e(l2e, addr); srcptepaddr = be64toh(*l3e); if (srcptepaddr == 0) continue; if (srcptepaddr & RPTE_LEAF) { if ((addr & L3_PAGE_MASK) != 0 || addr + L3_PAGE_SIZE > end_addr) continue; dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; l3e = (pml3_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); l3e = &l3e[pmap_pml3e_index(addr)]; if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *l3e = htobe64(srcptepaddr & ~PG_W); pmap_resident_count_inc(dst_pmap, L3_PAGE_SIZE / PAGE_SIZE); counter_u64_add(pmap_l3e_mappings, 1); } else dst_pdpg->ref_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; while (addr < va_next) { pt_entry_t ptetemp; ptetemp = be64toh(*src_pte); /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (dstmpte != NULL && dstmpte->pindex == pmap_l3e_pindex(addr)) dstmpte->ref_count++; else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (be64toh(*dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M | PG_A)); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not * mapped, paging-structure * caches could nonetheless * have entries that refer to * the freed page table pages. * Invalidate those entries. */ invalidate_all = true; vm_page_free_pages_toq(&free, true); } goto out; } if (dstmpte->ref_count >= srcmpte->ref_count) break; } addr += PAGE_SIZE; if (__predict_false((addr & L3_PAGE_MASK) == 0)) src_pte = pmap_pte(src_pmap, addr); else src_pte++; } } out: if (invalidate_all) pmap_invalidate_all(dst_pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static void mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); /* * XXX slow */ bcopy((void *)src, (void *)dst, PAGE_SIZE); } static void mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, a_offset, mb, b_offset, xfersize); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static int pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pml3_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME); setpde: newpde = be64toh(*firstpte); if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W))) goto setpde; newpde &= ~RPTE_EAA_W; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = be64toh(*pte); if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))) goto setpte; oldpte &= ~RPTE_EAA_W; CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | (va & ~L3_PAGE_MASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l3e: page table page is out of range")); KASSERT(mpte->pindex == pmap_l3e_pindex(va), ("pmap_promote_l3e: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx in pmap %p", va, pmap); goto fail; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); pte_store(pde, PG_PROMOTED | newpde); ptesync(); counter_u64_add(pmap_l3e_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" " in pmap %p", va, pmap); return (0); fail: counter_u64_add(pmap_l3e_p_failures, 1); return (KERN_FAILURE); } #endif /* VM_NRESERVLEVEL > 0 */ int mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pml3_entry_t *l3e; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv, retrycount; boolean_t nosleep, invalidate_all, invalidate_page; va = trunc_page(va); retrycount = 0; invalidate_page = invalidate_all = false; CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, m, prot, flags, psind); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((flags & VM_PROT_READ) != 0) newpte |= PG_A; if (prot & VM_PROT_READ) newpte |= RPTE_EAA_R; if ((prot & VM_PROT_WRITE) != 0) newpte |= RPTE_EAA_W; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if (prot & VM_PROT_EXECUTE) newpte |= PG_X; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va >= DMAP_MIN_ADDRESS) newpte |= RPTE_EAA_P; newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: l3e = pmap_pml3e(pmap, va); if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 || pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { pte = pmap_l3e_to_pte(l3e, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } if (__predict_false(retrycount++ == 6)) panic("too many retries"); invalidate_all = true; goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = be64toh(*pte); pv = NULL; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) { printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" " asid=%lu curpid=%d name=%s origpte0x%lx\n", pmap, va, m, prot, flags, psind, pmap->pm_pid, curproc->p_pid, curproc->p_comm, origpte); pmap_pte_walk(pmap->pm_pml1, va); } #endif /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) goto retry; if ((newpte & PG_M) != (origpte & PG_M)) vm_page_dirty(m); if ((newpte & PG_A) != (origpte & PG_A)) vm_page_aflag_set(m, PGA_REFERENCED); ptesync(); } else invalidate_all = true; if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; } goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = be64toh(pte_load_clear(pte)); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); #ifdef INVARIANTS else if (origpte & PG_MANAGED) { if (pv == NULL) { pmap_page_print_mappings(om); MPASS(pv != NULL); } } #endif if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) invalidate_page = true; origpte = 0; } else { if (pmap != kernel_pmap) { #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", pmap, va, m, prot, flags, psind, pmap->pm_pid, curproc->p_pid, curproc->p_comm); #endif } /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } #ifdef VERBOSE_PV else printf("reassigning pv: %p to pmap: %p\n", pv, pmap); #endif CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = be64toh(pte_load_store(pte, htobe64(newpte))); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); invalidate_page = true; /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { /* * Removing capabilities requires invalidation on POWER */ invalidate_page = true; goto unchanged; } if ((origpte & PG_A) != 0) invalidate_page = true; } else { pte_store(pte, newpte); ptesync(); } unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && mmu_radix_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0 && pmap_promote_l3e(pmap, l3e, va, &lock) == 0) invalidate_all = true; #endif if (invalidate_all) pmap_invalidate_all(pmap); else if (invalidate_page) pmap_invalidate_page(pmap, va); rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pml3_entry_t newpde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | RPTE_LEAF | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if (prot & VM_PROT_EXECUTE) newpde |= PG_X; if (prot & VM_PROT_READ) newpde |= RPTE_EAA_R; if (va >= DMAP_MIN_ADDRESS) newpde |= RPTE_EAA_P; return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pml3_entry_t oldl3e, *l3e; vm_page_t mt, pdpg; KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); l3e = &l3e[pmap_pml3e_index(va)]; oldl3e = be64toh(*l3e); if ((oldl3e & PG_V) != 0) { KASSERT(pdpg->ref_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldl3e & RPTE_LEAF) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocl3e() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); pmap_invalidate_l3e_page(pmap, va, oldl3e); } else { if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, &free, lockp)) pmap_invalidate_all(pmap); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); if (pmap_insert_pt_page(pmap, mt)) { /* * XXX Currently, this can't happen because * we do not perform pmap_enter(psind == 1) * on the kernel pmap. */ panic("pmap_enter_pde: trie insert failed"); } } else KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p", l3e)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pte_store(l3e, newpde); ptesync(); counter_u64_add(pmap_l3e_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } void mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; bool invalidate; VM_OBJECT_ASSERT_LOCKED(m_start->object); CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, end, m_start, prot); invalidate = false; psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && m->psind == 1 && mmu_radix_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; else mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, mpte, &lock, &invalidate); m = TAILQ_NEXT(m, listq); } ptesync(); if (lock != NULL) rw_wunlock(lock); if (invalidate) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) { struct spglist free; pt_entry_t *pte; vm_paddr_t pa; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pml3_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_l3e_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pml3e(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (be64toh(*ptepa) & PG_V) != 0) { if (be64toh(*ptepa) & RPTE_LEAF) return (NULL); mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = pmap_pte(pmap, va); } if (be64toh(*pte)) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ *invalidate = true; vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); if (prot & VM_PROT_EXECUTE) pa |= PG_X; else pa |= RPTE_EAA_R; if ((m->oflags & VPO_UNMANAGED) == 0) pa |= PG_MANAGED; pte_store(pte, pa); return (mpte); } void mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; bool invalidate; lock = NULL; invalidate = false; PMAP_LOCK(pmap); mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, &invalidate); ptesync(); if (lock != NULL) rw_wunlock(lock); if (invalidate) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va) { pml3_entry_t *l3e; pt_entry_t *pte; vm_paddr_t pa; l3e = pmap_pml3e(pmap, va); if (__predict_false(l3e == NULL)) return (0); if (be64toh(*l3e) & RPTE_LEAF) { pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); pa |= (va & L3_PAGE_MASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pte = pmap_l3e_to_pte(l3e, va); if (__predict_false(pte == NULL)) return (0); pa = be64toh(*pte); pa = (pa & PG_FRAME) | (va & PAGE_MASK); pa |= (va & PAGE_MASK); } return (pa); } vm_page_t mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pml3_entry_t l3e, *l3ep; pt_entry_t pte; vm_page_t m; m = NULL; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); PMAP_LOCK(pmap); l3ep = pmap_pml3e(pmap, va); if (l3ep != NULL && (l3e = be64toh(*l3ep))) { if (l3e & RPTE_LEAF) { if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK)); } else { /* Native endian PTE, do not pass to pmap functions */ pte = be64toh(*pmap_l3e_to_pte(l3ep, va)); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static void mmu_radix_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pml3_entry_t *l3e; pml2_entry_t *l2e; CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); if (VM_MIN_KERNEL_ADDRESS < addr && addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) return; addr = roundup2(addr, L3_PAGE_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); if ((be64toh(*l2e) & PG_V) == 0) { /* We need a new PDP entry */ - nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) - mmu_radix_zero_page(nkpg); + nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT; paddr = VM_PAGE_TO_PHYS(nkpg); pde_store(l2e, paddr); continue; /* try again */ } l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); if ((be64toh(*l3e) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } - nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) - mmu_radix_zero_page(nkpg); + nkpg->pindex = pmap_l3e_pindex(kernel_vm_end); paddr = VM_PAGE_TO_PHYS(nkpg); pde_store(l3e, paddr); kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } ptesync(); } static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); static uma_zone_t zone_radix_pgd; static int radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, int flags) { for (int i = 0; i < count; i++) { vm_page_t m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, VM_MEMATTR_DEFAULT); /* XXX zero on alloc here so we don't have to later */ store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (count); } static void radix_pgd_release(void *arg __unused, void **store, int count) { vm_page_t m; struct spglist free; int page_count; SLIST_INIT(&free); page_count = RADIX_PGD_SIZE/PAGE_SIZE; for (int i = 0; i < count; i++) { /* * XXX selectively remove dmap and KVA entries so we don't * need to bzero */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (int j = page_count-1; j >= 0; j--) { vm_page_unwire_noq(&m[j]); SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); } vm_page_free_pages_toq(&free, false); } } static void mmu_radix_init() { vm_page_t mpte; vm_size_t s; int error, i, pv_npg; /* XXX is this really needed for POWER? */ /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); zone_radix_pgd = uma_zcache_create("radix_pgd_cache", RADIX_PGD_SIZE, NULL, NULL, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif radix_pgd_import, radix_pgd_release, NULL, UMA_ZONE_NOBUCKET); /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range size: %lu", vm_page_array_size)); mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); //pmap_insert_pt_page(kernel_pmap, mpte); mpte->ref_count = 1; } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); CTR1(KTR_PMAP, "%s()", __func__); TAILQ_INIT(&pv_dummy.pv_list); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L3_PAGE_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) mask |= PG_RW | PG_M; if (accessed) mask |= PG_V | PG_A; rv = (be64toh(*pte) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pml3e(pmap, pv->pv_va); mask = 0; if (modified) mask |= PG_RW | PG_M; if (accessed) mask |= PG_V | PG_A; rv = (be64toh(*pte) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t mmu_radix_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); CTR2(KTR_PMAP, "%s(%p)", __func__, m); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } boolean_t mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pml3_entry_t *l3e; pt_entry_t *pte; boolean_t rv; CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); rv = FALSE; PMAP_LOCK(pmap); l3e = pmap_pml3e(pmap, addr); if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) { pte = pmap_l3e_to_pte(l3e, addr); rv = (be64toh(*pte) & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } boolean_t mmu_radix_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ boolean_t mmu_radix_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pml3_entry_t oldl3e, *l3e; pt_entry_t *pte; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l3e = pmap_pml3e(pmap, pv->pv_va); oldl3e = be64toh(*l3e); if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldl3e & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldl3e & PG_W) == 0) { atomic_clear_long(l3e, htobe64(PG_A)); pmap_invalidate_page(pmap, pv->pv_va); cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((be64toh(*pte) & PG_A) != 0) { atomic_clear_long(pte, htobe64(PG_A)); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } static vm_offset_t mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, vm_paddr_t end, int prot __unused) { CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, prot); return (PHYS_TO_DMAP(start)); } void mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pml3_entry_t *l3e; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; vm_memattr_t ma; CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, object, pindex, size); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); /* NB: size can be logically ored with addr here */ if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { if (!mmu_radix_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); ma = p->md.mdpg_cache_attrs; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & L3_PAGE_MASK) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || ma != p->md.mdpg_cache_attrs) return; p = TAILQ_NEXT(p, listq); } PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(ma); pa < ptepa + size; pa += L3_PAGE_SIZE) { pdpg = pmap_allocl3e(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += L3_PAGE_SIZE; continue; } l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); l3e = &l3e[pmap_pml3e_index(addr)]; if ((be64toh(*l3e) & PG_V) == 0) { pa |= PG_M | PG_A | PG_RW; pte_store(l3e, pa); pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); counter_u64_add(pmap_l3e_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->ref_count--; KASSERT(pdpg->ref_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += L3_PAGE_SIZE; } ptesync(); PMAP_UNLOCK(pmap); } } boolean_t mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } void mmu_radix_page_init(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); TAILQ_INIT(&m->md.pv_list); m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; } int mmu_radix_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); CTR2(KTR_PMAP, "%s(%p)", __func__, m); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((be64toh(*pte) & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pml3e(pmap, pv->pv_va); if ((be64toh(*pte) & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } static void mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) { isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); } int mmu_radix_pinit(pmap_t pmap) { vmem_addr_t pid; vm_paddr_t l1pa; CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); /* * allocate the page directory page */ pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); pmap->pm_radix.rt_root = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = PMAP_PDE_SUPERPAGE; vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); pmap->pm_pid = pid; l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); mmu_radix_update_proctab(pid, l1pa); __asm __volatile("ptesync;isync" : : : "memory"); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } - if ((m->flags & PG_ZERO) == 0) - mmu_radix_zero_page(m); + m->pindex = ptepindex; /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml1_entry_t *l1e; vm_pindex_t pml1index; /* Wire up a new PDPE page */ pml1index = ptepindex - (NUPDE + NUPDPE); l1e = &pmap->pm_pml1[pml1index]; KASSERT((be64toh(*l1e) & PG_V) == 0, ("%s: L1 entry %#lx is valid", __func__, *l1e)); pde_store(l1e, VM_PAGE_TO_PHYS(m)); } else if (ptepindex >= NUPDE) { vm_pindex_t pml1index; vm_pindex_t pdpindex; pml1_entry_t *l1e; pml2_entry_t *l2e; /* Wire up a new l2e page */ pdpindex = ptepindex - NUPDE; pml1index = pdpindex >> RPTE_SHIFT; l1e = &pmap->pm_pml1[pml1index]; if ((be64toh(*l1e) & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to l2e page */ pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME); pdppg->ref_count++; } l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); /* Now find the pdp page */ l2e = &l2e[pdpindex & RPTE_MASK]; KASSERT((be64toh(*l2e) & PG_V) == 0, ("%s: L2 entry %#lx is valid", __func__, *l2e)); pde_store(l2e, VM_PAGE_TO_PHYS(m)); } else { vm_pindex_t pml1index; vm_pindex_t pdpindex; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; /* Wire up a new PTE page */ pdpindex = ptepindex >> RPTE_SHIFT; pml1index = pdpindex >> RPTE_SHIFT; /* First, find the pdp and check that its valid. */ l1e = &pmap->pm_pml1[pml1index]; if ((be64toh(*l1e) & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); l2e = &l2e[pdpindex & RPTE_MASK]; } else { l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); l2e = &l2e[pdpindex & RPTE_MASK]; if ((be64toh(*l2e) & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME); pdpg->ref_count++; } } l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME); /* Now we know where the page directory page is */ l3e = &l3e[ptepindex & RPTE_MASK]; KASSERT((be64toh(*l3e) & PG_V) == 0, ("%s: L3 entry %#lx is valid", __func__, *l3e)); pde_store(l3e, VM_PAGE_TO_PHYS(m)); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pml2_entry_t *pdpe; vm_page_t pdpg; retry: pdpe = pmap_pml2e(pmap, va); if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME); pdpg->ref_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_l3e_pindex(va); pdpindex = ptepindex >> RPTE_SHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pml3_entry_t *pd; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l3e_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pml3e(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (be64toh(*pd) & PG_V) != 0) { m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } static void mmu_radix_pinit0(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); PMAP_LOCK_INIT(pmap); pmap->pm_pml1 = kernel_pmap->pm_pml1; pmap->pm_pid = kernel_pmap->pm_pid; pmap->pm_radix.rt_root = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); kernel_pmap->pm_flags = pmap->pm_flags = PMAP_PDE_SUPERPAGE; } /* * pmap_protect_l3e: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) { pt_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L3_PAGE_MASK) == 0, ("pmap_protect_l3e: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = be64toh(*l3e); if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + L3_PAGE_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) { newpde &= ~(PG_RW | PG_M); newpde |= RPTE_EAA_R; } if (prot & VM_PROT_EXECUTE) newpde |= PG_X; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED))) goto retry; anychanged = TRUE; } return (anychanged); } void mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t ptpaddr, *l3e; pt_entry_t *pte; boolean_t anychanged; CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, prot); KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { mmu_radix_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #ifdef INVARIANTS if (VERBOSE_PROTECT || pmap_logging) printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", pmap, sva, eva, prot, pmap->pm_pid); #endif anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((be64toh(*l1e) & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((be64toh(*l2e) & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); ptpaddr = be64toh(*l3e); /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & RPTE_LEAF) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { if (pmap_protect_l3e(pmap, l3e, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_l3e(pmap, l3e, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: MPASS(pte == pmap_pte(pmap, sva)); obits = pbits = be64toh(*pte); if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); pbits |= RPTE_EAA_R; } if (prot & VM_PROT_EXECUTE) pbits |= PG_X; if (pbits != obits) { if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits))) goto retry; if (obits & (PG_A|PG_M)) { anychanged = TRUE; #ifdef INVARIANTS if (VERBOSE_PROTECT || pmap_logging) printf("%#lx %#lx -> %#lx\n", sva, obits, pbits); #endif } } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } void mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) { CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); pt_entry_t oldpte, pa, *pte; vm_page_t m; uint64_t cache_bits, attr_bits; vm_offset_t va; oldpte = 0; attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; va = sva; pte = kvtopte(va); while (va < sva + PAGE_SIZE * count) { if (__predict_false((va & L3_PAGE_MASK) == 0)) pte = kvtopte(va); MPASS(pte == pmap_pte(kernel_pmap, va)); /* * XXX there has to be a more efficient way than traversing * the page table every time - but go for correctness for * today */ m = *ma++; cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; if (be64toh(*pte) != pa) { oldpte |= be64toh(*pte); pte_store(pte, pa); } va += PAGE_SIZE; pte++; } if (__predict_false((oldpte & RPTE_VALID) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); else ptesync(); } void mmu_radix_qremove(vm_offset_t sva, int count) { vm_offset_t va; pt_entry_t *pte; CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); va = sva; pte = kvtopte(va); while (va < sva + PAGE_SIZE * count) { if (__predict_false((va & L3_PAGE_MASK) == 0)) pte = kvtopte(va); pte_clear(pte); pte++; va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_radix, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ pml1_entry_t *pml1; pml1 = pmap_pml1e(pmap, va); *pml1 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pml2_entry_t *l2e; l2e = pmap_pml2e(pmap, va); *l2e = 0; } else { /* PTE page */ pml3_entry_t *l3e; l3e = pmap_pml3e(pmap, va); *l3e = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void mmu_radix_release(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_radix), ("pmap_release: pmap has reserved page table page(s)")); pmap_invalidate_all(pmap); isa3_proctab[pmap->pm_pid].proctab0 = 0; uma_zfree(zone_radix_pgd, pmap->pm_pml1); vmem_free(asid_arena, pmap->pm_pid, 1); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = htobe64(newpte); newpte += PAGE_SIZE; } } static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, struct rwlock **lockp) { pml3_entry_t oldpde; pt_entry_t *firstpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = be64toh(*l3e); KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", oldpde)); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_l3e: page table page for a wired mapping" " is missing")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed or the allocation of the new * page table page fails. If the 2MB page mapping belongs to * the direct map region of the kernel's address space, then * the page allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is * normal. Page table pages are preallocated for every other * part of the kernel address space, so the direct map region * is the only part of the kernel address space that must be * handled here. */ - if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, - pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < - DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj( + (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? + VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_l3e(pmap, l3e, sva, &free, lockp); pmap_invalidate_l3e_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } + mpte->pindex = pmap_l3e_pindex(va); if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); KASSERT((oldpde & PG_A) != 0, ("pmap_demote_l3e: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_l3e: oldpde is missing PG_M")); /* * If the page table page is new, initialize it. */ if (mpte->ref_count == 1) { mpte->ref_count = NPTEPG; pmap_fill_ptp(firstpte, oldpde); } KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME), ("pmap_demote_l3e: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, oldpde); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l3e() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ pde_store(l3e, mptepa); pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); counter_u64_add(pmap_l3e_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) { vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); /* * Initialize the page table page. */ pagezero(PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ pde_store(l3e, mptepa); ptesync(); } /* * pmap_remove_l3e: do the things to unmap a superpage in a process */ static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pml3_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L3_PAGE_MASK) == 0, ("pmap_remove_l3e: sva is not 2mpage aligned")); oldpde = be64toh(pte_load_clear(pdq)); if (oldpde & PG_W) pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + L3_PAGE_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l3e(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_l3e: pte page wire count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = be64toh(pte_load_clear(ptq)); if (oldpte & RPTE_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & RPTE_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte; bool invalidate_all; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((be64toh(*l3e) & RPTE_VALID) == 0) { return (false); } pte = pmap_l3e_to_pte(l3e, va); if ((be64toh(*pte) & RPTE_VALID) == 0) { return (false); } lock = NULL; invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock); if (lock != NULL) rw_wunlock(lock); if (!invalidate_all) pmap_invalidate_page(pmap, va); return (invalidate_all); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) { pt_entry_t *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); anyvalid = false; va = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if (*pte == 0) { if (va != eva) { anyvalid = true; va = eva; } continue; } if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) { anyvalid = true; sva += PAGE_SIZE; break; } } if (anyvalid) pmap_invalidate_all(pmap); else if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } void mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t ptpaddr, *l3e; struct spglist free; bool anyvalid; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = false; SLIST_INIT(&free); /* XXX something fishy here */ sva = (sva + PAGE_MASK) & ~PAGE_MASK; eva = (eva + PAGE_MASK) & ~PAGE_MASK; PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { l3e = pmap_pml3e(pmap, sva); if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) { anyvalid = pmap_remove_page(pmap, sva, l3e, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1e = pmap_pml1e(pmap, sva); if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); ptpaddr = be64toh(*l3e); /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & RPTE_LEAF) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { pmap_remove_l3e(pmap, l3e, sva, &free, &lock); anyvalid = true; continue; } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = be64toh(*l3e); } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) anyvalid = true; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } void mmu_radix_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte; pml3_entry_t *l3e; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l3e = pmap_pml3e(pmap, va); (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); tpte = be64toh(pte_load_clear(pte)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_started() and * pmap_delayed_invl_finished(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void mmu_radix_remove_pages(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); pml3_entry_t ptel3e; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap->pm_pid == mfspr(SPR_PID), ("non-current asid %lu - expected %lu", pmap->pm_pid, mfspr(SPR_PID))); lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = cnttzd(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pml2e(pmap, pv->pv_va); ptel3e = be64toh(*pte); pte = pmap_l2e_to_l3e(pte, pv->pv_va); tpte = be64toh(*pte); if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { superpage = FALSE; ptel3e = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = be64toh(*pte); } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on RPTE_LEAF being set. * * This is because RPTE_LEAF is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); #ifdef VERBOSE_PV printf("freeing pv (%p, %p)\n", pmap, pv); #endif TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } void mmu_radix_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pml3_entry_t *l3e; pt_entry_t oldpte, *pte; int pvh_gen, md_gen; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3e = pmap_pml3e(pmap, pv->pv_va); if ((be64toh(*l3e) & PG_RW) != 0) (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); retry: oldpte = be64toh(*pte); if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_l3e() for the wired entry must never fail, * pmap_delayed_invl_started()/finished() calls around the * function are not needed. */ void mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((be64toh(*l1e) & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((be64toh(*l2e) & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); if ((be64toh(*l3e) & PG_V) == 0) continue; if ((be64toh(*l3e) & RPTE_LEAF) != 0) { if ((be64toh(*l3e) & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)(be64toh(*l3e))); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { atomic_clear_long(l3e, htobe64(PG_W)); pmap->pm_stats.wired_count -= L3_PAGE_SIZE / PAGE_SIZE; continue; } else if (!pmap_demote_l3e(pmap, l3e, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if ((be64toh(*pte) & PG_V) == 0) continue; if ((be64toh(*pte) & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)(be64toh(*pte))); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, htobe64(PG_W)); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } void mmu_radix_zero_page(vm_page_t m) { vm_offset_t addr; CTR2(KTR_PMAP, "%s(%p)", __func__, m); addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero(addr); } void mmu_radix_zero_page_area(vm_page_t m, int off, int size) { caddr_t addr; CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); MPASS(off + size <= PAGE_SIZE); addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); memset(addr + off, 0, size); } static int mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pml3_entry_t *l3ep; pt_entry_t pte; vm_paddr_t pa; int val; CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); PMAP_LOCK(pmap); l3ep = pmap_pml3e(pmap, addr); if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) { if (be64toh(*l3ep) & RPTE_LEAF) { pte = be64toh(*l3ep); /* Compute the physical address of the 4KB page. */ pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & PG_FRAME; val = MINCORE_PSIND(1); } else { /* Native endian PTE, do not pass to functions */ pte = be64toh(*pmap_l3e_to_pte(l3ep, addr)); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { *locked_pa = pa; } PMAP_UNLOCK(pmap); return (val); } void mmu_radix_activate(struct thread *td) { pmap_t pmap; uint32_t curpid; CTR2(KTR_PMAP, "%s(%p)", __func__, td); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); curpid = mfspr(SPR_PID); if (pmap->pm_pid > isa3_base_pid && curpid != pmap->pm_pid) { mmu_radix_pid_set(pmap); } critical_exit(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, size); vm_offset_t superpage_offset; if (size < L3_PAGE_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L3_PAGE_MASK; if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || (*addr & L3_PAGE_MASK) == superpage_offset) return; if ((*addr & L3_PAGE_MASK) < superpage_offset) *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; else *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; } static void * mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); if (pa < powerpc_ptob(Maxmem)) panic("bad pa: %#lx less than Maxmem %#lx\n", pa, powerpc_ptob(Maxmem)); va = kva_alloc(size); if (bootverbose) printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); if (!va) panic("%s: Couldn't alloc kernel virtual memory", __func__); for (tmpva = va; size > 0;) { mmu_radix_kenter_attr(tmpva, ppa, attr); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } ptesync(); return ((void *)(va + offset)); } static void * mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); } void mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); m->md.mdpg_cache_attrs = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.mdpg_cache_attrs)) panic("memory attribute change on the direct map failed"); } static void mmu_radix_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t offset; CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); if (pmap_initialized) { mmu_radix_qremove(va, atop(size)); kva_free(va, size); } } static __inline void pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) { uint64_t opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = be64toh(*pte); npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte))); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) { pml2_entry_t oldpdpe; pml3_entry_t *firstpde, newpde, *pde; vm_paddr_t pdpgpa; vm_page_t pdpg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = be64toh(*l2e); KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); - pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (pdpg == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } + pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT; pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = htobe64(newpde); newpde += L3_PAGE_SIZE; } /* * Demote the mapping. */ pde_store(l2e, pdpgpa); /* * Flush PWC --- XXX revisit */ pmap_invalidate_all(pmap); counter_u64_add(pmap_l2e_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } vm_paddr_t mmu_radix_kextract(vm_offset_t va) { pml3_entry_t l3e; vm_paddr_t pa; CTR2(KTR_PMAP, "%s(%#x)", __func__, va); if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { /* Big-endian PTE on stack */ l3e = *pmap_pml3e(kernel_pmap, va); if (be64toh(l3e) & RPTE_LEAF) { pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); pa |= (va & L3_PAGE_MASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = be64toh(*pmap_l3e_to_pte(&l3e, va)); pa = (pa & PG_FRAME) | (va & PAGE_MASK); pa |= (va & PAGE_MASK); } } return (pa); } static pt_entry_t mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { if (ma != VM_MEMATTR_DEFAULT) { return pmap_cache_bits(ma); } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ for (int i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) return (RPTE_ATTR_MEM); } return (RPTE_ATTR_GUARDEDIO); } static void mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte, pteval; uint64_t cache_bits; pte = kvtopte(va); MPASS(pte != NULL); pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; cache_bits = mmu_radix_calc_wimg(pa, ma); pte_store(pte, pteval | cache_bits); } void mmu_radix_kremove(vm_offset_t va) { pt_entry_t *pte; CTR2(KTR_PMAP, "%s(%#x)", __func__, va); pte = kvtopte(va); pte_clear(pte); } int mmu_radix_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded) { CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); *decoded = addr; *is_user = (addr < VM_MAXUSER_ADDRESS); return (0); } static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (mem_valid(pa, size)); } static void mmu_radix_scan_init() { CTR1(KTR_PMAP, "%s()", __func__); UNIMPLEMENTED(); } static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) { CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); UNIMPLEMENTED(); } vm_offset_t mmu_radix_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; CTR2(KTR_PMAP, "%s(%p)", __func__, m); paddr = VM_PAGE_TO_PHYS(m); return (PHYS_TO_DMAP(paddr)); } void mmu_radix_quick_remove_page(vm_offset_t addr __unused) { /* no work to do here */ CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); } static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { cpu_flush_dcache((void *)sva, eva - sva); } int mmu_radix_change_attr(vm_offset_t va, vm_size_t size, vm_memattr_t mode) { int error; CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, true); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; int cache_bits, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits = pmap_cache_bits(mode); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { l2e = pmap_pml2e(kernel_pmap, tmpva); if (l2e == NULL || *l2e == 0) return (EINVAL); if (be64toh(*l2e) & RPTE_LEAF) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) { tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & L2_PAGE_MASK) == 0 && tmpva + L2_PAGE_MASK < base + size) { tmpva += L2_PAGE_MASK; continue; } if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) return (ENOMEM); } l3e = pmap_l2e_to_l3e(l2e, tmpva); KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", tmpva, l2e)); if (*l3e == 0) return (EINVAL); if (be64toh(*l3e) & RPTE_LEAF) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) { tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & L3_PAGE_MASK) == 0 && tmpva + L3_PAGE_MASK < base + size) { tmpva += L3_PAGE_SIZE; continue; } if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) return (ENOMEM); } pte = pmap_l3e_to_pte(l3e, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { l2e = pmap_pml2e(kernel_pmap, tmpva); if (be64toh(*l2e) & RPTE_LEAF) { if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(l2e, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*l2e & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = be64toh(*l2e) & PG_PS_FRAME; pa_end = pa_start + L2_PAGE_SIZE; } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME)) pa_end += L2_PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = be64toh(*l2e) & PG_PS_FRAME; pa_end = pa_start + L2_PAGE_SIZE; } } tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; continue; } l3e = pmap_l2e_to_l3e(l2e, tmpva); if (be64toh(*l3e) & RPTE_LEAF) { if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(l3e, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = be64toh(*l3e) & PG_PS_FRAME; pa_end = pa_start + L3_PAGE_SIZE; } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME)) pa_end += L3_PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = be64toh(*l3e) & PG_PS_FRAME; pa_end = pa_start + L3_PAGE_SIZE; } } tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; } else { pte = pmap_l3e_to_pte(l3e, tmpva); if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(pte, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (be64toh(*pte) & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = be64toh(*pte) & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (be64toh(*pte) & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = be64toh(*pte) & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, flush); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_all(kernel_pmap); if (flush) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. */ void mmu_radix_page_array_startup(long pages) { #ifdef notyet pml2_entry_t *l2e; pml3_entry_t *pde; pml3_entry_t newl3; vm_offset_t va; long pfn; int domain, i; #endif vm_paddr_t pa; vm_offset_t start, end; vm_page_array_size = pages; start = VM_MIN_KERNEL_ADDRESS; end = start + pages * sizeof(struct vm_page); pa = vm_phys_early_alloc(0, end - start); start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT); #ifdef notyet /* TODO: NUMA vm_page_array. Blocked out until then (copied from amd64). */ for (va = start; va < end; va += L3_PAGE_SIZE) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = vm_phys_domain(ptoa(pfn)); l2e = pmap_pml2e(kernel_pmap, va); if ((be64toh(*l2e) & PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero(PHYS_TO_DMAP(pa)); pde_store(l2e, (pml2_entry_t)pa); } pde = pmap_l2e_to_l3e(l2e, va); if ((be64toh(*pde) & PG_V) != 0) panic("Unexpected pde %p", pde); pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W); pte_store(pde, newl3); } #endif vm_page_array = (vm_page_t)start; } #ifdef DDB #include #include static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) { pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; l1e = &l1[pmap_pml1e_index(va)]; db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e)); if ((be64toh(*l1e) & PG_V) == 0) { db_printf("\n"); return; } l2e = pmap_l1e_to_l2e(l1e, va); db_printf(" l2e %#016lx", be64toh(*l2e)); if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) { db_printf("\n"); return; } l3e = pmap_l2e_to_l3e(l2e, va); db_printf(" l3e %#016lx", be64toh(*l3e)); if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) { db_printf("\n"); return; } pte = pmap_l3e_to_pte(l3e, va); db_printf(" pte %#016lx\n", be64toh(*pte)); } void pmap_page_print_mappings(vm_page_t m) { pmap_t pmap; pv_entry_t pv; db_printf("page %p(%lx)\n", m, m->phys_addr); /* need to elide locks if running in ddb */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { db_printf("pv: %p ", pv); db_printf("va: %#016lx ", pv->pv_va); pmap = PV_PMAP(pv); db_printf("pmap %p ", pmap); if (pmap != NULL) { db_printf("asid: %lu\n", pmap->pm_pid); pmap_pte_walk(pmap->pm_pml1, pv->pv_va); } } } DB_SHOW_COMMAND(pte, pmap_print_pte) { vm_offset_t va; pmap_t pmap; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (va >= DMAP_MIN_ADDRESS) pmap = kernel_pmap; else if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = vmspace_pmap(curthread->td_proc->p_vmspace); pmap_pte_walk(pmap->pm_pml1, va); } #endif diff --git a/sys/powerpc/booke/pmap_32.c b/sys/powerpc/booke/pmap_32.c index 924eb223a2b6..4098c19f6e00 100644 --- a/sys/powerpc/booke/pmap_32.c +++ b/sys/powerpc/booke/pmap_32.c @@ -1,1003 +1,1003 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2020 Justin Hibbits * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some hw specific parts of this pmap were derived or influenced * by NetBSD's ibm4xx pmap module. More generic code is shared with * a few other pmap modules from the FreeBSD tree. */ /* * VM layout notes: * * Kernel and user threads run within one common virtual address space * defined by AS=0. * * 32-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000 - 0x7fff_ffff : user process * 0x8000_0000 - 0xbfff_ffff : pmap_mapdev()-ed area (PCI/PCIE etc.) * 0xc000_0000 - 0xffff_efff : KVA */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PRI0ptrX "08x" /* Reserved KVA space and mutex for mmu_booke_zero_page. */ static vm_offset_t zero_page_va; static struct mtx zero_page_mutex; /* Reserved KVA space and mutex for mmu_booke_copy_page. */ static vm_offset_t copy_page_src_va; static vm_offset_t copy_page_dst_va; static struct mtx copy_page_mutex; static vm_offset_t kernel_ptbl_root; static unsigned int kernel_ptbls; /* Number of KVA ptbls. */ /**************************************************************************/ /* PMAP */ /**************************************************************************/ #define VM_MAPDEV_BASE ((vm_offset_t)VM_MAXUSER_ADDRESS + PAGE_SIZE) static void tid_flush(tlbtid_t tid); static unsigned long ilog2(unsigned long); /**************************************************************************/ /* Page table management */ /**************************************************************************/ #define PMAP_ROOT_SIZE (sizeof(pte_t**) * PDIR_NENTRIES) static void ptbl_init(void); static struct ptbl_buf *ptbl_buf_alloc(void); static void ptbl_buf_free(struct ptbl_buf *); static void ptbl_free_pmap_ptbl(pmap_t, pte_t *); static pte_t *ptbl_alloc(pmap_t, unsigned int, boolean_t); static void ptbl_free(pmap_t, unsigned int); static void ptbl_hold(pmap_t, unsigned int); static int ptbl_unhold(pmap_t, unsigned int); static vm_paddr_t pte_vatopa(pmap_t, vm_offset_t); static int pte_enter(pmap_t, vm_page_t, vm_offset_t, uint32_t, boolean_t); static int pte_remove(pmap_t, vm_offset_t, uint8_t); static pte_t *pte_find(pmap_t, vm_offset_t); struct ptbl_buf { TAILQ_ENTRY(ptbl_buf) link; /* list link */ vm_offset_t kva; /* va of mapping */ }; /* Number of kva ptbl buffers, each covering one ptbl (PTBL_PAGES). */ #define PTBL_BUFS (128 * 16) /* ptbl free list and a lock used for access synchronization. */ static TAILQ_HEAD(, ptbl_buf) ptbl_buf_freelist; static struct mtx ptbl_buf_freelist_lock; /* Base address of kva space allocated fot ptbl bufs. */ static vm_offset_t ptbl_buf_pool_vabase; /* Pointer to ptbl_buf structures. */ static struct ptbl_buf *ptbl_bufs; /**************************************************************************/ /* Page table related */ /**************************************************************************/ /* Initialize pool of kva ptbl buffers. */ static void ptbl_init(void) { int i; CTR3(KTR_PMAP, "%s: s (ptbl_bufs = 0x%08x size 0x%08x)", __func__, (uint32_t)ptbl_bufs, sizeof(struct ptbl_buf) * PTBL_BUFS); CTR3(KTR_PMAP, "%s: s (ptbl_buf_pool_vabase = 0x%08x size = 0x%08x)", __func__, ptbl_buf_pool_vabase, PTBL_BUFS * PTBL_PAGES * PAGE_SIZE); mtx_init(&ptbl_buf_freelist_lock, "ptbl bufs lock", NULL, MTX_DEF); TAILQ_INIT(&ptbl_buf_freelist); for (i = 0; i < PTBL_BUFS; i++) { ptbl_bufs[i].kva = ptbl_buf_pool_vabase + i * PTBL_PAGES * PAGE_SIZE; TAILQ_INSERT_TAIL(&ptbl_buf_freelist, &ptbl_bufs[i], link); } } /* Get a ptbl_buf from the freelist. */ static struct ptbl_buf * ptbl_buf_alloc(void) { struct ptbl_buf *buf; mtx_lock(&ptbl_buf_freelist_lock); buf = TAILQ_FIRST(&ptbl_buf_freelist); if (buf != NULL) TAILQ_REMOVE(&ptbl_buf_freelist, buf, link); mtx_unlock(&ptbl_buf_freelist_lock); CTR2(KTR_PMAP, "%s: buf = %p", __func__, buf); return (buf); } /* Return ptbl buff to free pool. */ static void ptbl_buf_free(struct ptbl_buf *buf) { CTR2(KTR_PMAP, "%s: buf = %p", __func__, buf); mtx_lock(&ptbl_buf_freelist_lock); TAILQ_INSERT_TAIL(&ptbl_buf_freelist, buf, link); mtx_unlock(&ptbl_buf_freelist_lock); } /* * Search the list of allocated ptbl bufs and find on list of allocated ptbls */ static void ptbl_free_pmap_ptbl(pmap_t pmap, pte_t *ptbl) { struct ptbl_buf *pbuf; CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl); PMAP_LOCK_ASSERT(pmap, MA_OWNED); TAILQ_FOREACH(pbuf, &pmap->pm_ptbl_list, link) if (pbuf->kva == (vm_offset_t)ptbl) { /* Remove from pmap ptbl buf list. */ TAILQ_REMOVE(&pmap->pm_ptbl_list, pbuf, link); /* Free corresponding ptbl buf. */ ptbl_buf_free(pbuf); break; } } /* Allocate page table. */ static pte_t * ptbl_alloc(pmap_t pmap, unsigned int pdir_idx, boolean_t nosleep) { vm_page_t mtbl[PTBL_PAGES]; vm_page_t m; struct ptbl_buf *pbuf; unsigned int pidx; pte_t *ptbl; int i, j; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_alloc: invalid pdir_idx")); KASSERT((pmap->pm_pdir[pdir_idx] == NULL), ("pte_alloc: valid ptbl entry exists!")); pbuf = ptbl_buf_alloc(); if (pbuf == NULL) panic("pte_alloc: couldn't alloc kernel virtual memory"); ptbl = (pte_t *)pbuf->kva; CTR2(KTR_PMAP, "%s: ptbl kva = %p", __func__, ptbl); for (i = 0; i < PTBL_PAGES; i++) { pidx = (PTBL_PAGES * pdir_idx) + i; - while ((m = vm_page_alloc(NULL, pidx, - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + while ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { if (nosleep) { ptbl_free_pmap_ptbl(pmap, ptbl); for (j = 0; j < i; j++) vm_page_free(mtbl[j]); vm_wire_sub(i); return (NULL); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } + m->pindex = pidx; mtbl[i] = m; } /* Map allocated pages into kernel_pmap. */ mmu_booke_qenter((vm_offset_t)ptbl, mtbl, PTBL_PAGES); /* Zero whole ptbl. */ bzero((caddr_t)ptbl, PTBL_PAGES * PAGE_SIZE); /* Add pbuf to the pmap ptbl bufs list. */ TAILQ_INSERT_TAIL(&pmap->pm_ptbl_list, pbuf, link); return (ptbl); } /* Free ptbl pages and invalidate pdir entry. */ static void ptbl_free(pmap_t pmap, unsigned int pdir_idx) { pte_t *ptbl; vm_paddr_t pa; vm_offset_t va; vm_page_t m; int i; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_free: invalid pdir_idx")); ptbl = pmap->pm_pdir[pdir_idx]; CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl); KASSERT((ptbl != NULL), ("ptbl_free: null ptbl")); /* * Invalidate the pdir entry as soon as possible, so that other CPUs * don't attempt to look up the page tables we are releasing. */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); pmap->pm_pdir[pdir_idx] = NULL; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); for (i = 0; i < PTBL_PAGES; i++) { va = ((vm_offset_t)ptbl + (i * PAGE_SIZE)); pa = pte_vatopa(kernel_pmap, va); m = PHYS_TO_VM_PAGE(pa); vm_page_free_zero(m); vm_wire_sub(1); mmu_booke_kremove(va); } ptbl_free_pmap_ptbl(pmap, ptbl); } /* * Decrement ptbl pages hold count and attempt to free ptbl pages. * Called when removing pte entry from ptbl. * * Return 1 if ptbl pages were freed. */ static int ptbl_unhold(pmap_t pmap, unsigned int pdir_idx) { pte_t *ptbl; vm_paddr_t pa; vm_page_t m; int i; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_unhold: invalid pdir_idx")); KASSERT((pmap != kernel_pmap), ("ptbl_unhold: unholding kernel ptbl!")); ptbl = pmap->pm_pdir[pdir_idx]; //debugf("ptbl_unhold: ptbl = 0x%08x\n", (u_int32_t)ptbl); KASSERT(((vm_offset_t)ptbl >= VM_MIN_KERNEL_ADDRESS), ("ptbl_unhold: non kva ptbl")); /* decrement hold count */ for (i = 0; i < PTBL_PAGES; i++) { pa = pte_vatopa(kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); m->ref_count--; } /* * Free ptbl pages if there are no pte etries in this ptbl. * ref_count has the same value for all ptbl pages, so check the last * page. */ if (m->ref_count == 0) { ptbl_free(pmap, pdir_idx); //debugf("ptbl_unhold: e (freed ptbl)\n"); return (1); } return (0); } /* * Increment hold count for ptbl pages. This routine is used when a new pte * entry is being inserted into the ptbl. */ static void ptbl_hold(pmap_t pmap, unsigned int pdir_idx) { vm_paddr_t pa; pte_t *ptbl; vm_page_t m; int i; CTR3(KTR_PMAP, "%s: pmap = %p pdir_idx = %d", __func__, pmap, pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_hold: invalid pdir_idx")); KASSERT((pmap != kernel_pmap), ("ptbl_hold: holding kernel ptbl!")); ptbl = pmap->pm_pdir[pdir_idx]; KASSERT((ptbl != NULL), ("ptbl_hold: null ptbl")); for (i = 0; i < PTBL_PAGES; i++) { pa = pte_vatopa(kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); m->ref_count++; } } /* * Clean pte entry, try to free page table page if requested. * * Return 1 if ptbl pages were freed, otherwise return 0. */ static int pte_remove(pmap_t pmap, vm_offset_t va, uint8_t flags) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); vm_page_t m; pte_t *ptbl; pte_t *pte; //int su = (pmap == kernel_pmap); //debugf("pte_remove: s (su = %d pmap = 0x%08x va = 0x%08x flags = %d)\n", // su, (u_int32_t)pmap, va, flags); ptbl = pmap->pm_pdir[pdir_idx]; KASSERT(ptbl, ("pte_remove: null ptbl")); pte = &ptbl[ptbl_idx]; if (pte == NULL || !PTE_ISVALID(pte)) return (0); if (PTE_ISWIRED(pte)) pmap->pm_stats.wired_count--; /* Get vm_page_t for mapped pte. */ m = PHYS_TO_VM_PAGE(PTE_PA(pte)); /* Handle managed entry. */ if (PTE_ISMANAGED(pte)) { if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); if (PTE_ISREFERENCED(pte)) vm_page_aflag_set(m, PGA_REFERENCED); pv_remove(pmap, va, m); } else if (pmap == kernel_pmap && m && m->md.pv_tracked) { /* * Always pv_insert()/pv_remove() on MPC85XX, in case DPAA is * used. This is needed by the NCSW support code for fast * VA<->PA translation. */ pv_remove(pmap, va, m); if (TAILQ_EMPTY(&m->md.pv_list)) m->md.pv_tracked = false; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); pmap->pm_stats.resident_count--; if (flags & PTBL_UNHOLD) { //debugf("pte_remove: e (unhold)\n"); return (ptbl_unhold(pmap, pdir_idx)); } //debugf("pte_remove: e\n"); return (0); } /* * Insert PTE for a given page and virtual address. */ static int pte_enter(pmap_t pmap, vm_page_t m, vm_offset_t va, uint32_t flags, boolean_t nosleep) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); pte_t *ptbl, *pte, pte_tmp; CTR4(KTR_PMAP, "%s: su = %d pmap = %p va = %p", __func__, pmap == kernel_pmap, pmap, va); /* Get the page table pointer. */ ptbl = pmap->pm_pdir[pdir_idx]; if (ptbl == NULL) { /* Allocate page table pages. */ ptbl = ptbl_alloc(pmap, pdir_idx, nosleep); if (ptbl == NULL) { KASSERT(nosleep, ("nosleep and NULL ptbl")); return (ENOMEM); } pmap->pm_pdir[pdir_idx] = ptbl; pte = &ptbl[ptbl_idx]; } else { /* * Check if there is valid mapping for requested * va, if there is, remove it. */ pte = &pmap->pm_pdir[pdir_idx][ptbl_idx]; if (PTE_ISVALID(pte)) { pte_remove(pmap, va, PTBL_HOLD); } else { /* * pte is not used, increment hold count * for ptbl pages. */ if (pmap != kernel_pmap) ptbl_hold(pmap, pdir_idx); } } /* * Insert pv_entry into pv_list for mapped page if part of managed * memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { flags |= PTE_MANAGED; /* Create and insert pv entry. */ pv_insert(pmap, va, m); } pmap->pm_stats.resident_count++; pte_tmp = PTE_RPN_FROM_PA(VM_PAGE_TO_PHYS(m)); pte_tmp |= (PTE_VALID | flags | PTE_PS_4KB); /* 4KB pages only */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = pte_tmp; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } /* Return the pa for the given pmap/va. */ static vm_paddr_t pte_vatopa(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa = 0; pte_t *pte; pte = pte_find(pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) pa = (PTE_PA(pte) | (va & PTE_PA_MASK)); return (pa); } /* Get a pointer to a PTE in a page table. */ static pte_t * pte_find(pmap_t pmap, vm_offset_t va) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); KASSERT((pmap != NULL), ("pte_find: invalid pmap")); if (pmap->pm_pdir[pdir_idx]) return (&(pmap->pm_pdir[pdir_idx][ptbl_idx])); return (NULL); } /* Get a pointer to a PTE in a page table, or the next closest (greater) one. */ static __inline pte_t * pte_find_next(pmap_t pmap, vm_offset_t *pva) { vm_offset_t va; pte_t **pdir; pte_t *pte; unsigned long i, j; KASSERT((pmap != NULL), ("pte_find: invalid pmap")); va = *pva; i = PDIR_IDX(va); j = PTBL_IDX(va); pdir = pmap->pm_pdir; for (; i < PDIR_NENTRIES; i++, j = 0) { if (pdir[i] == NULL) continue; for (; j < PTBL_NENTRIES; j++) { pte = &pdir[i][j]; if (!PTE_ISVALID(pte)) continue; *pva = PDIR_SIZE * i + PAGE_SIZE * j; return (pte); } } return (NULL); } /* Set up kernel page tables. */ static void kernel_pte_alloc(vm_offset_t data_end, vm_offset_t addr) { pte_t *pte; vm_offset_t va; vm_offset_t pdir_start; int i; kptbl_min = VM_MIN_KERNEL_ADDRESS / PDIR_SIZE; kernel_pmap->pm_pdir = (pte_t **)kernel_ptbl_root; pdir_start = kernel_ptbl_root + PDIR_NENTRIES * sizeof(pte_t); /* Initialize kernel pdir */ for (i = 0; i < kernel_ptbls; i++) { kernel_pmap->pm_pdir[kptbl_min + i] = (pte_t *)(pdir_start + (i * PAGE_SIZE * PTBL_PAGES)); } /* * Fill in PTEs covering kernel code and data. They are not required * for address translation, as this area is covered by static TLB1 * entries, but for pte_vatopa() to work correctly with kernel area * addresses. */ for (va = addr; va < data_end; va += PAGE_SIZE) { pte = &(kernel_pmap->pm_pdir[PDIR_IDX(va)][PTBL_IDX(va)]); powerpc_sync(); *pte = PTE_RPN_FROM_PA(kernload + (va - kernstart)); *pte |= PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID | PTE_PS_4KB; } } static vm_offset_t mmu_booke_alloc_kernel_pgtables(vm_offset_t data_end) { /* Allocate space for ptbl_bufs. */ ptbl_bufs = (struct ptbl_buf *)data_end; data_end += sizeof(struct ptbl_buf) * PTBL_BUFS; debugf(" ptbl_bufs at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", (uintptr_t)ptbl_bufs, data_end); data_end = round_page(data_end); kernel_ptbl_root = data_end; data_end += PDIR_NENTRIES * sizeof(pte_t*); /* Allocate PTE tables for kernel KVA. */ kernel_ptbls = howmany(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, PDIR_SIZE); data_end += kernel_ptbls * PTBL_PAGES * PAGE_SIZE; debugf(" kernel ptbls: %d\n", kernel_ptbls); debugf(" kernel pdir at %#jx end = %#jx\n", (uintmax_t)kernel_ptbl_root, (uintmax_t)data_end); return (data_end); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static int mmu_booke_pinit(pmap_t pmap) { int i; CTR4(KTR_PMAP, "%s: pmap = %p, proc %d '%s'", __func__, pmap, curthread->td_proc->p_pid, curthread->td_proc->p_comm); KASSERT((pmap != kernel_pmap), ("pmap_pinit: initializing kernel_pmap")); for (i = 0; i < MAXCPU; i++) pmap->pm_tid[i] = TID_NONE; CPU_ZERO(&kernel_pmap->pm_active); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_pdir = uma_zalloc(ptbl_root_zone, M_WAITOK); bzero(pmap->pm_pdir, sizeof(pte_t *) * PDIR_NENTRIES); TAILQ_INIT(&pmap->pm_ptbl_list); return (1); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by mmu_booke_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void mmu_booke_release(pmap_t pmap) { KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); uma_zfree(ptbl_root_zone, pmap->pm_pdir); } static void mmu_booke_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { pte_t *pte; vm_paddr_t pa = 0; int sync_sz, valid; pmap_t pmap; vm_page_t m; vm_offset_t addr; int active; rw_wlock(&pvh_global_lock); pmap = PCPU_GET(curpmap); active = (pm == kernel_pmap || pm == pmap) ? 1 : 0; while (sz > 0) { PMAP_LOCK(pm); pte = pte_find(pm, va); valid = (pte != NULL && PTE_ISVALID(pte)) ? 1 : 0; if (valid) pa = PTE_PA(pte); PMAP_UNLOCK(pm); sync_sz = PAGE_SIZE - (va & PAGE_MASK); sync_sz = min(sync_sz, sz); if (valid) { if (!active) { /* * Create a mapping in the active pmap. * * XXX: We use the zero page here, because * it isn't likely to be in use. * If we ever decide to support * security.bsd.map_at_zero on Book-E, change * this to some other address that isn't * normally mappable. */ addr = 0; m = PHYS_TO_VM_PAGE(pa); PMAP_LOCK(pmap); pte_enter(pmap, m, addr, PTE_SR | PTE_VALID, FALSE); __syncicache((void *)(addr + (va & PAGE_MASK)), sync_sz); pte_remove(pmap, addr, PTBL_UNHOLD); PMAP_UNLOCK(pmap); } else __syncicache((void *)va, sync_sz); } va += sync_sz; sz -= sync_sz; } rw_wunlock(&pvh_global_lock); } /* * mmu_booke_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ static void mmu_booke_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; /* XXX KASSERT off and size are within a single page? */ mtx_lock(&zero_page_mutex); va = zero_page_va; mmu_booke_kenter(va, VM_PAGE_TO_PHYS(m)); bzero((caddr_t)va + off, size); mmu_booke_kremove(va); mtx_unlock(&zero_page_mutex); } /* * mmu_booke_zero_page zeros the specified hardware page. */ static void mmu_booke_zero_page(vm_page_t m) { vm_offset_t off, va; va = zero_page_va; mtx_lock(&zero_page_mutex); mmu_booke_kenter(va, VM_PAGE_TO_PHYS(m)); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); mmu_booke_kremove(va); mtx_unlock(&zero_page_mutex); } /* * mmu_booke_copy_page copies the specified (machine independent) page by * mapping the page into virtual memory and using memcopy to copy the page, * one machine dependent page at a time. */ static void mmu_booke_copy_page(vm_page_t sm, vm_page_t dm) { vm_offset_t sva, dva; sva = copy_page_src_va; dva = copy_page_dst_va; mtx_lock(©_page_mutex); mmu_booke_kenter(sva, VM_PAGE_TO_PHYS(sm)); mmu_booke_kenter(dva, VM_PAGE_TO_PHYS(dm)); memcpy((caddr_t)dva, (caddr_t)sva, PAGE_SIZE); mmu_booke_kremove(dva); mmu_booke_kremove(sva); mtx_unlock(©_page_mutex); } static inline void mmu_booke_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(©_page_mutex); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); mmu_booke_kenter(copy_page_src_va, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)copy_page_src_va + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); mmu_booke_kenter(copy_page_dst_va, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)copy_page_dst_va + b_pg_offset; bcopy(a_cp, b_cp, cnt); mmu_booke_kremove(copy_page_dst_va); mmu_booke_kremove(copy_page_src_va); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(©_page_mutex); } static vm_offset_t mmu_booke_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; vm_offset_t qaddr; uint32_t flags; pte_t *pte; paddr = VM_PAGE_TO_PHYS(m); flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID; flags |= tlb_calc_wimg(paddr, pmap_page_get_memattr(m)) << PTE_MAS2_SHIFT; flags |= PTE_PS_4KB; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = pte_find(kernel_pmap, qaddr); KASSERT(*pte == 0, ("mmu_booke_quick_enter_page: PTE busy")); /* * XXX: tlbivax is broadcast to other cores, but qaddr should * not be present in other TLBs. Is there a better instruction * sequence to use? Or just forget it & use mmu_booke_kenter()... */ __asm __volatile("tlbivax 0, %0" :: "r"(qaddr & MAS2_EPN_MASK)); __asm __volatile("isync; msync"); *pte = PTE_RPN_FROM_PA(paddr) | flags; /* Flush the real memory from the instruction cache. */ if ((flags & (PTE_I | PTE_G)) == 0) __syncicache((void *)qaddr, PAGE_SIZE); return (qaddr); } static void mmu_booke_quick_remove_page(vm_offset_t addr) { pte_t *pte; pte = pte_find(kernel_pmap, addr); KASSERT(PCPU_GET(qmap_addr) == addr, ("mmu_booke_quick_remove_page: invalid address")); KASSERT(*pte != 0, ("mmu_booke_quick_remove_page: PTE not in use")); *pte = 0; critical_exit(); } /**************************************************************************/ /* TID handling */ /**************************************************************************/ /* * Return the largest uint value log such that 2^log <= num. */ static unsigned long ilog2(unsigned long num) { long lz; __asm ("cntlzw %0, %1" : "=r" (lz) : "r" (num)); return (31 - lz); } /* * Invalidate all TLB0 entries which match the given TID. Note this is * dedicated for cases when invalidations should NOT be propagated to other * CPUs. */ static void tid_flush(tlbtid_t tid) { register_t msr; uint32_t mas0, mas1, mas2; int entry, way; /* Don't evict kernel translations */ if (tid == TID_KERNEL) return; msr = mfmsr(); __asm __volatile("wrteei 0"); /* * Newer (e500mc and later) have tlbilx, which doesn't broadcast, so use * it for PID invalidation. */ switch ((mfpvr() >> 16) & 0xffff) { case FSL_E500mc: case FSL_E5500: case FSL_E6500: mtspr(SPR_MAS6, tid << MAS6_SPID0_SHIFT); /* tlbilxpid */ __asm __volatile("isync; .long 0x7c200024; isync; msync"); __asm __volatile("wrtee %0" :: "r"(msr)); return; } for (way = 0; way < TLB0_WAYS; way++) for (entry = 0; entry < TLB0_ENTRIES_PER_WAY; entry++) { mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way); mtspr(SPR_MAS0, mas0); mas2 = entry << MAS2_TLB0_ENTRY_IDX_SHIFT; mtspr(SPR_MAS2, mas2); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); if (!(mas1 & MAS1_VALID)) continue; if (((mas1 & MAS1_TID_MASK) >> MAS1_TID_SHIFT) != tid) continue; mas1 &= ~MAS1_VALID; mtspr(SPR_MAS1, mas1); __asm __volatile("isync; tlbwe; isync; msync"); } __asm __volatile("wrtee %0" :: "r"(msr)); } diff --git a/sys/powerpc/booke/pmap_64.c b/sys/powerpc/booke/pmap_64.c index 55db7a325be5..05db755740a1 100644 --- a/sys/powerpc/booke/pmap_64.c +++ b/sys/powerpc/booke/pmap_64.c @@ -1,791 +1,788 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2020 Justin Hibbits * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some hw specific parts of this pmap were derived or influenced * by NetBSD's ibm4xx pmap module. More generic code is shared with * a few other pmap modules from the FreeBSD tree. */ /* * VM layout notes: * * Kernel and user threads run within one common virtual address space * defined by AS=0. * * 64-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000_0000_0000 - 0x3fff_ffff_ffff_ffff : user process * 0x4000_0000_0000_0000 - 0x7fff_ffff_ffff_ffff : unused * 0x8000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : mmio region * 0xc000_0000_0000_0000 - 0xdfff_ffff_ffff_ffff : direct map * 0xe000_0000_0000_0000 - 0xffff_ffff_ffff_ffff : KVA */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG #define debugf(fmt, args...) printf(fmt, ##args) #else #define debugf(fmt, args...) #endif #define PRI0ptrX "016lx" /**************************************************************************/ /* PMAP */ /**************************************************************************/ unsigned int kernel_pdirs; static uma_zone_t ptbl_root_zone; static pte_t ****kernel_ptbl_root; /* * Base of the pmap_mapdev() region. On 32-bit it immediately follows the * userspace address range. On On 64-bit it's far above, at (1 << 63), and * ranges up to the DMAP, giving 62 bits of PA allowed. This is far larger than * the widest Book-E address bus, the e6500 has a 40-bit PA space. This allows * us to map akin to the DMAP, with addresses identical to the PA, offset by the * base. */ #define VM_MAPDEV_BASE 0x8000000000000000 #define VM_MAPDEV_PA_MAX 0x4000000000000000 /* Don't encroach on DMAP */ static void tid_flush(tlbtid_t tid); static unsigned long ilog2(unsigned long); /**************************************************************************/ /* Page table management */ /**************************************************************************/ #define PMAP_ROOT_SIZE (sizeof(pte_t****) * PG_ROOT_NENTRIES) static pte_t *ptbl_alloc(pmap_t pmap, vm_offset_t va, bool nosleep, bool *is_new); static void ptbl_hold(pmap_t, pte_t *); static int ptbl_unhold(pmap_t, vm_offset_t); static vm_paddr_t pte_vatopa(pmap_t, vm_offset_t); static int pte_enter(pmap_t, vm_page_t, vm_offset_t, uint32_t, boolean_t); static int pte_remove(pmap_t, vm_offset_t, uint8_t); static pte_t *pte_find(pmap_t, vm_offset_t); static pte_t *pte_find_next(pmap_t, vm_offset_t *); static void kernel_pte_alloc(vm_offset_t, vm_offset_t); /**************************************************************************/ /* Page table related */ /**************************************************************************/ /* Allocate a page, to be used in a page table. */ static vm_offset_t mmu_booke_alloc_page(pmap_t pmap, unsigned int idx, bool nosleep) { vm_page_t m; int req; - req = VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO; - while ((m = vm_page_alloc(NULL, idx, req)) == NULL) { + req = VM_ALLOC_WIRED | VM_ALLOC_ZERO; + while ((m = vm_page_alloc_noobj(req)) == NULL) { if (nosleep) return (0); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } - - if (!(m->flags & PG_ZERO)) - /* Zero whole ptbl. */ - mmu_booke_zero_page(m); + m->pindex = idx; return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } /* Initialize pool of kva ptbl buffers. */ static void ptbl_init(void) { } /* Get a pointer to a PTE in a page table. */ static __inline pte_t * pte_find(pmap_t pmap, vm_offset_t va) { pte_t ***pdir_l1; pte_t **pdir; pte_t *ptbl; KASSERT((pmap != NULL), ("pte_find: invalid pmap")); pdir_l1 = pmap->pm_root[PG_ROOT_IDX(va)]; if (pdir_l1 == NULL) return (NULL); pdir = pdir_l1[PDIR_L1_IDX(va)]; if (pdir == NULL) return (NULL); ptbl = pdir[PDIR_IDX(va)]; return ((ptbl != NULL) ? &ptbl[PTBL_IDX(va)] : NULL); } /* Get a pointer to a PTE in a page table, or the next closest (greater) one. */ static __inline pte_t * pte_find_next(pmap_t pmap, vm_offset_t *pva) { vm_offset_t va; pte_t ****pm_root; pte_t *pte; unsigned long i, j, k, l; KASSERT((pmap != NULL), ("pte_find: invalid pmap")); va = *pva; i = PG_ROOT_IDX(va); j = PDIR_L1_IDX(va); k = PDIR_IDX(va); l = PTBL_IDX(va); pm_root = pmap->pm_root; /* truncate the VA for later. */ va &= ~((1UL << (PG_ROOT_H + 1)) - 1); for (; i < PG_ROOT_NENTRIES; i++, j = 0, k = 0, l = 0) { if (pm_root[i] == 0) continue; for (; j < PDIR_L1_NENTRIES; j++, k = 0, l = 0) { if (pm_root[i][j] == 0) continue; for (; k < PDIR_NENTRIES; k++, l = 0) { if (pm_root[i][j][k] == NULL) continue; for (; l < PTBL_NENTRIES; l++) { pte = &pm_root[i][j][k][l]; if (!PTE_ISVALID(pte)) continue; *pva = va + PG_ROOT_SIZE * i + PDIR_L1_SIZE * j + PDIR_SIZE * k + PAGE_SIZE * l; return (pte); } } } } return (NULL); } static bool unhold_free_page(pmap_t pmap, vm_page_t m) { if (vm_page_unwire_noq(m)) { vm_page_free_zero(m); return (true); } return (false); } static vm_offset_t get_pgtbl_page(pmap_t pmap, vm_offset_t *ptr_tbl, uint32_t index, bool nosleep, bool hold_parent, bool *isnew) { vm_offset_t page; vm_page_t m; page = ptr_tbl[index]; KASSERT(page != 0 || pmap != kernel_pmap, ("NULL page table page found in kernel pmap!")); if (page == 0) { page = mmu_booke_alloc_page(pmap, index, nosleep); if (ptr_tbl[index] == 0) { *isnew = true; ptr_tbl[index] = page; if (hold_parent) { m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)ptr_tbl)); m->ref_count++; } return (page); } m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(page)); page = ptr_tbl[index]; vm_page_unwire_noq(m); vm_page_free_zero(m); } *isnew = false; return (page); } /* Allocate page table. */ static pte_t* ptbl_alloc(pmap_t pmap, vm_offset_t va, bool nosleep, bool *is_new) { unsigned int pg_root_idx = PG_ROOT_IDX(va); unsigned int pdir_l1_idx = PDIR_L1_IDX(va); unsigned int pdir_idx = PDIR_IDX(va); vm_offset_t pdir_l1, pdir, ptbl; /* When holding a parent, no need to hold the root index pages. */ pdir_l1 = get_pgtbl_page(pmap, (vm_offset_t *)pmap->pm_root, pg_root_idx, nosleep, false, is_new); if (pdir_l1 == 0) return (NULL); pdir = get_pgtbl_page(pmap, (vm_offset_t *)pdir_l1, pdir_l1_idx, nosleep, !*is_new, is_new); if (pdir == 0) return (NULL); ptbl = get_pgtbl_page(pmap, (vm_offset_t *)pdir, pdir_idx, nosleep, !*is_new, is_new); return ((pte_t *)ptbl); } /* * Decrement ptbl pages hold count and attempt to free ptbl pages. Called * when removing pte entry from ptbl. * * Return 1 if ptbl pages were freed. */ static int ptbl_unhold(pmap_t pmap, vm_offset_t va) { pte_t *ptbl; vm_page_t m; u_int pg_root_idx; pte_t ***pdir_l1; u_int pdir_l1_idx; pte_t **pdir; u_int pdir_idx; pg_root_idx = PG_ROOT_IDX(va); pdir_l1_idx = PDIR_L1_IDX(va); pdir_idx = PDIR_IDX(va); KASSERT((pmap != kernel_pmap), ("ptbl_unhold: unholding kernel ptbl!")); pdir_l1 = pmap->pm_root[pg_root_idx]; pdir = pdir_l1[pdir_l1_idx]; ptbl = pdir[pdir_idx]; /* decrement hold count */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) ptbl)); if (!unhold_free_page(pmap, m)) return (0); pdir[pdir_idx] = NULL; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) pdir)); if (!unhold_free_page(pmap, m)) return (1); pdir_l1[pdir_l1_idx] = NULL; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) pdir_l1)); if (!unhold_free_page(pmap, m)) return (1); pmap->pm_root[pg_root_idx] = NULL; return (1); } /* * Increment hold count for ptbl pages. This routine is used when new pte * entry is being inserted into ptbl. */ static void ptbl_hold(pmap_t pmap, pte_t *ptbl) { vm_page_t m; KASSERT((pmap != kernel_pmap), ("ptbl_hold: holding kernel ptbl!")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) ptbl)); m->ref_count++; } /* * Clean pte entry, try to free page table page if requested. * * Return 1 if ptbl pages were freed, otherwise return 0. */ static int pte_remove(pmap_t pmap, vm_offset_t va, u_int8_t flags) { vm_page_t m; pte_t *pte; pte = pte_find(pmap, va); KASSERT(pte != NULL, ("%s: NULL pte for va %#jx, pmap %p", __func__, (uintmax_t)va, pmap)); if (!PTE_ISVALID(pte)) return (0); /* Get vm_page_t for mapped pte. */ m = PHYS_TO_VM_PAGE(PTE_PA(pte)); if (PTE_ISWIRED(pte)) pmap->pm_stats.wired_count--; /* Handle managed entry. */ if (PTE_ISMANAGED(pte)) { /* Handle modified pages. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); /* Referenced pages. */ if (PTE_ISREFERENCED(pte)) vm_page_aflag_set(m, PGA_REFERENCED); /* Remove pv_entry from pv_list. */ pv_remove(pmap, va, m); } else if (pmap == kernel_pmap && m && m->md.pv_tracked) { pv_remove(pmap, va, m); if (TAILQ_EMPTY(&m->md.pv_list)) m->md.pv_tracked = false; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); pmap->pm_stats.resident_count--; if (flags & PTBL_UNHOLD) { return (ptbl_unhold(pmap, va)); } return (0); } /* * Insert PTE for a given page and virtual address. */ static int pte_enter(pmap_t pmap, vm_page_t m, vm_offset_t va, uint32_t flags, boolean_t nosleep) { unsigned int ptbl_idx = PTBL_IDX(va); pte_t *ptbl, *pte, pte_tmp; bool is_new; /* Get the page directory pointer. */ ptbl = ptbl_alloc(pmap, va, nosleep, &is_new); if (ptbl == NULL) { KASSERT(nosleep, ("nosleep and NULL ptbl")); return (ENOMEM); } if (is_new) { pte = &ptbl[ptbl_idx]; } else { /* * Check if there is valid mapping for requested va, if there * is, remove it. */ pte = &ptbl[ptbl_idx]; if (PTE_ISVALID(pte)) { pte_remove(pmap, va, PTBL_HOLD); } else { /* * pte is not used, increment hold count for ptbl * pages. */ if (pmap != kernel_pmap) ptbl_hold(pmap, ptbl); } } /* * Insert pv_entry into pv_list for mapped page if part of managed * memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { flags |= PTE_MANAGED; /* Create and insert pv entry. */ pv_insert(pmap, va, m); } pmap->pm_stats.resident_count++; pte_tmp = PTE_RPN_FROM_PA(VM_PAGE_TO_PHYS(m)); pte_tmp |= (PTE_VALID | flags); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = pte_tmp; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } /* Return the pa for the given pmap/va. */ static vm_paddr_t pte_vatopa(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa = 0; pte_t *pte; pte = pte_find(pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) pa = (PTE_PA(pte) | (va & PTE_PA_MASK)); return (pa); } /* allocate pte entries to manage (addr & mask) to (addr & mask) + size */ static void kernel_pte_alloc(vm_offset_t data_end, vm_offset_t addr) { pte_t *pte; vm_size_t kva_size; int kernel_pdirs, kernel_pgtbls, pdir_l1s; vm_offset_t va, l1_va, pdir_va, ptbl_va; int i, j, k; kva_size = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; kernel_pmap->pm_root = kernel_ptbl_root; pdir_l1s = howmany(kva_size, PG_ROOT_SIZE); kernel_pdirs = howmany(kva_size, PDIR_L1_SIZE); kernel_pgtbls = howmany(kva_size, PDIR_SIZE); /* Initialize kernel pdir */ l1_va = (vm_offset_t)kernel_ptbl_root + round_page(PG_ROOT_NENTRIES * sizeof(pte_t ***)); pdir_va = l1_va + pdir_l1s * PAGE_SIZE; ptbl_va = pdir_va + kernel_pdirs * PAGE_SIZE; if (bootverbose) { printf("ptbl_root_va: %#lx\n", (vm_offset_t)kernel_ptbl_root); printf("l1_va: %#lx (%d entries)\n", l1_va, pdir_l1s); printf("pdir_va: %#lx(%d entries)\n", pdir_va, kernel_pdirs); printf("ptbl_va: %#lx(%d entries)\n", ptbl_va, kernel_pgtbls); } va = VM_MIN_KERNEL_ADDRESS; for (i = PG_ROOT_IDX(va); i < PG_ROOT_IDX(va) + pdir_l1s; i++, l1_va += PAGE_SIZE) { kernel_pmap->pm_root[i] = (pte_t ***)l1_va; for (j = 0; j < PDIR_L1_NENTRIES && va < VM_MAX_KERNEL_ADDRESS; j++, pdir_va += PAGE_SIZE) { kernel_pmap->pm_root[i][j] = (pte_t **)pdir_va; for (k = 0; k < PDIR_NENTRIES && va < VM_MAX_KERNEL_ADDRESS; k++, va += PDIR_SIZE, ptbl_va += PAGE_SIZE) kernel_pmap->pm_root[i][j][k] = (pte_t *)ptbl_va; } } /* * Fill in PTEs covering kernel code and data. They are not required * for address translation, as this area is covered by static TLB1 * entries, but for pte_vatopa() to work correctly with kernel area * addresses. */ for (va = addr; va < data_end; va += PAGE_SIZE) { pte = &(kernel_pmap->pm_root[PG_ROOT_IDX(va)][PDIR_L1_IDX(va)][PDIR_IDX(va)][PTBL_IDX(va)]); *pte = PTE_RPN_FROM_PA(kernload + (va - kernstart)); *pte |= PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID | PTE_PS_4KB; } } static vm_offset_t mmu_booke_alloc_kernel_pgtables(vm_offset_t data_end) { vm_size_t kva_size = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; kernel_ptbl_root = (pte_t ****)data_end; data_end += round_page(PG_ROOT_NENTRIES * sizeof(pte_t ***)); data_end += howmany(kva_size, PG_ROOT_SIZE) * PAGE_SIZE; data_end += howmany(kva_size, PDIR_L1_SIZE) * PAGE_SIZE; data_end += howmany(kva_size, PDIR_SIZE) * PAGE_SIZE; return (data_end); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static int mmu_booke_pinit(pmap_t pmap) { int i; CTR4(KTR_PMAP, "%s: pmap = %p, proc %d '%s'", __func__, pmap, curthread->td_proc->p_pid, curthread->td_proc->p_comm); KASSERT((pmap != kernel_pmap), ("pmap_pinit: initializing kernel_pmap")); for (i = 0; i < MAXCPU; i++) pmap->pm_tid[i] = TID_NONE; CPU_ZERO(&kernel_pmap->pm_active); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_root = uma_zalloc(ptbl_root_zone, M_WAITOK); bzero(pmap->pm_root, sizeof(pte_t **) * PG_ROOT_NENTRIES); return (1); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by mmu_booke_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void mmu_booke_release(pmap_t pmap) { KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); #ifdef INVARIANTS /* * Verify that all page directories are gone. * Protects against reference count leakage. */ for (int i = 0; i < PG_ROOT_NENTRIES; i++) KASSERT(pmap->pm_root[i] == 0, ("Index %d on root page %p is non-zero!\n", i, pmap->pm_root)); #endif uma_zfree(ptbl_root_zone, pmap->pm_root); } static void mmu_booke_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { pte_t *pte; vm_paddr_t pa = 0; int sync_sz, valid; while (sz > 0) { PMAP_LOCK(pm); pte = pte_find(pm, va); valid = (pte != NULL && PTE_ISVALID(pte)) ? 1 : 0; if (valid) pa = PTE_PA(pte); PMAP_UNLOCK(pm); sync_sz = PAGE_SIZE - (va & PAGE_MASK); sync_sz = min(sync_sz, sz); if (valid) { pa += (va & PAGE_MASK); __syncicache((void *)PHYS_TO_DMAP(pa), sync_sz); } va += sync_sz; sz -= sync_sz; } } /* * mmu_booke_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ static void mmu_booke_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; /* XXX KASSERT off and size are within a single page? */ va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); bzero((caddr_t)va + off, size); } /* * mmu_booke_zero_page zeros the specified hardware page. */ static void mmu_booke_zero_page(vm_page_t m) { vm_offset_t off, va; va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); } /* * mmu_booke_copy_page copies the specified (machine independent) page by * mapping the page into virtual memory and using memcopy to copy the page, * one machine dependent page at a time. */ static void mmu_booke_copy_page(vm_page_t sm, vm_page_t dm) { vm_offset_t sva, dva; sva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(sm)); dva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dm)); memcpy((caddr_t)dva, (caddr_t)sva, PAGE_SIZE); } static inline void mmu_booke_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; vm_page_t pa, pb; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pa = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pb = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); a_cp = (caddr_t)((uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pa)) + a_pg_offset); b_cp = (caddr_t)((uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pb)) + b_pg_offset); bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static vm_offset_t mmu_booke_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } static void mmu_booke_quick_remove_page(vm_offset_t addr) { } /**************************************************************************/ /* TID handling */ /**************************************************************************/ /* * Return the largest uint value log such that 2^log <= num. */ static unsigned long ilog2(unsigned long num) { long lz; __asm ("cntlzd %0, %1" : "=r" (lz) : "r" (num)); return (63 - lz); } /* * Invalidate all TLB0 entries which match the given TID. Note this is * dedicated for cases when invalidations should NOT be propagated to other * CPUs. */ static void tid_flush(tlbtid_t tid) { register_t msr; /* Don't evict kernel translations */ if (tid == TID_KERNEL) return; msr = mfmsr(); __asm __volatile("wrteei 0"); /* * Newer (e500mc and later) have tlbilx, which doesn't broadcast, so use * it for PID invalidation. */ mtspr(SPR_MAS6, tid << MAS6_SPID0_SHIFT); __asm __volatile("isync; .long 0x7c200024; isync; msync"); __asm __volatile("wrtee %0" :: "r"(msr)); } diff --git a/sys/powerpc/powerpc/uma_machdep.c b/sys/powerpc/powerpc/uma_machdep.c index a1769f61d671..ac3d98e18d0b 100644 --- a/sys/powerpc/powerpc/uma_machdep.c +++ b/sys/powerpc/powerpc/uma_machdep.c @@ -1,103 +1,100 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include static int hw_uma_mdpages; SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { void *va; vm_paddr_t pa; vm_page_t m; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_domain(NULL, 0, domain, - malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); #ifdef __powerpc64__ if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif if (!hw_direct_map) { pmap_kenter(pa, pa); va = (void *)(vm_offset_t)pa; } else { va = (void *)(vm_offset_t)PHYS_TO_DMAP(pa); } - - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - bzero(va, PAGE_SIZE); atomic_add_int(&hw_uma_mdpages, 1); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; if (hw_direct_map) m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem)); else { m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)mem)); pmap_kremove((vm_offset_t)mem); } KASSERT(m != NULL, ("Freeing UMA block at %p with no associated page", mem)); #ifdef __powerpc64__ dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); atomic_subtract_int(&hw_uma_mdpages, 1); } diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 292b1c2f6b3f..e1ff056117eb 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -1,4792 +1,4776 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2015-2018 Ruslan Bukin * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by Andrew Turner under * sponsorship from The FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) #define NUL2E (Ln_ENTRIES * NUL1E) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* The list of all the user pmaps */ LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); static struct rwlock_padalign pvh_global_lock; static struct mtx_padalign allpmaps_lock; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN, &superpages_enabled, 0, "Enable support for transparent superpages"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; extern cpuset_t all_harts; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); #define pmap_clear(pte) pmap_store(pte, 0) #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) #define pmap_load_clear(pte) pmap_load_store(pte, 0) #define pmap_load(pte) atomic_load_64(pte) #define pmap_store(pte, entry) atomic_store_64(pte, entry) #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline void pagezero(void *p) { bzero(p, PAGE_SIZE); } #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) #define PTE_TO_PHYS(pte) \ ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) #define L2PTE_TO_PHYS(l2) \ ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { KASSERT(VIRT_IS_VALID(va), ("%s: malformed virtual address %#lx", __func__, va)); return (&pmap->pm_l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { vm_paddr_t phys; pd_entry_t *l2; phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & PTE_V) == 0) return (NULL); if ((pmap_load(l1) & PTE_RX) != 0) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { vm_paddr_t phys; pt_entry_t *l3; phys = PTE_TO_PHYS(pmap_load(l2)); l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l3[pmap_l3_index(va)]); } static __inline pt_entry_t * pmap_l3(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2; l2 = pmap_l2(pmap, va); if (l2 == NULL) return (NULL); if ((pmap_load(l2) & PTE_V) == 0) return (NULL); if ((pmap_load(l2) & PTE_RX) != 0) return (NULL); return (pmap_l2_to_l3(l2, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, pt_entry_t entry) { struct pmap *user_pmap; pd_entry_t *l1; /* Distribute new kernel L1 entry to all the user pmaps */ if (pmap != kernel_pmap) return; mtx_lock(&allpmaps_lock); LIST_FOREACH(user_pmap, &allpmaps, pm_list) { l1 = &user_pmap->pm_l1[l1index]; pmap_store(l1, entry); } mtx_unlock(&allpmaps_lock); } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & PTE_RX) == 0, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_paddr_t ret; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); /* Check locore has used L2 superpages */ KASSERT((l2[l2_slot] & PTE_RX) != 0, ("Invalid bootstrap L2 table")); /* L2 is superpages */ ret = L2PTE_TO_PHYS(l2[l2_slot]); ret += (va & L2_OFFSET); return (ret); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; pt_entry_t entry; pn_t pn; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; l1 = (pd_entry_t *)kern_l1; l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); /* superpages */ pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(&l1[l1_slot], entry); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; sfence_vma(); } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; pt_entry_t entry; pd_entry_t *l2; vm_paddr_t pa; u_int l2_slot; pn_t pn; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pn = (pa / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(&l2[l2_slot], entry); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return (l3pt); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t max_pa, min_pa, pa; pt_entry_t *l2p; int i; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; PMAP_LOCK_INIT(kernel_pmap); rw_init(&pvh_global_lock, "pmap pv global"); /* * Set the current CPU as active in the kernel pmap. Secondary cores * will add themselves later in init_secondary(). The SBI firmware * may rely on this mask being precise, so CPU_FILL() is not used. */ CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); /* Assume the address we were loaded to is a valid physical address. */ min_pa = max_pa = kernstart; physmap_idx = physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } printf("physmap_idx %u\n", physmap_idx); printf("min_pa %lx\n", min_pa); printf("max_pa %lx\n", max_pa); /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); /* Create the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); /* * Invalidate the mapping we created for the DTB. At this point a copy * has been created, and we no longer need it. We want to avoid the * possibility of an aliased mapping in the future. */ l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); if ((pmap_load(l2p) & PTE_V) != 0) pmap_clear(l2p); sfence_vma(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L2_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Initialize the pv chunk and pmap list mutexes. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); if (superpages_enabled) pagesizes[1] = L2_SIZE; } #ifdef SMP /* * For SMP, these functions have to use IPIs for coherence. * * In general, the calling thread uses a plain fence to order the * writes to the page tables before invoking an SBI callback to invoke * sfence_vma() on remote CPUs. */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, va, 1); sfence_vma_page(va); sched_unpin(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); sched_unpin(); } static void pmap_invalidate_all(pmap_t pmap) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); /* * XXX: The SBI doc doesn't detail how to specify x0 as the * address to perform a global fence. BBL currently treats * all sfence_vma requests as global however. */ fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, 0, 0); sfence_vma(); sched_unpin(); } #else /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sfence_vma_page(va); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sfence_vma(); } #endif /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); /* * Start with the l2 tabel. We are unable to allocate * pages in the l1 table. */ l2p = pmap_l2(pmap, va); if (l2p != NULL) { l2 = pmap_load(l2p); if ((l2 & PTE_RX) == 0) { l3p = pmap_l2_to_l3(l2p, va); if (l3p != NULL) { l3 = pmap_load(l3p); pa = PTE_TO_PHYS(l3); pa |= (va & L3_OFFSET); } } else { /* L2 is superpages */ pa = L2PTE_TO_PHYS(l2); pa |= (va & L2_OFFSET); } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *l3p, l3; vm_paddr_t phys; vm_page_t m; m = NULL; PMAP_LOCK(pmap); l3p = pmap_l3(pmap, va); if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { phys = PTE_TO_PHYS(l3); m = PHYS_TO_VM_PAGE(phys); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *l2, l2e; pt_entry_t *l3; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l2 = pmap_l2(kernel_pmap, va); if (l2 == NULL) panic("pmap_kextract: No l2"); l2e = pmap_load(l2); /* * Beware of concurrent promotion and demotion! We must * use l2e rather than loading from l2 multiple times to * ensure we see a consistent state, including the * implicit load in pmap_l2_to_l3. It is, however, safe * to use an old l2e because the L3 page is preserved by * promotion. */ if ((l2e & PTE_RX) != 0) { /* superpages */ pa = L2PTE_TO_PHYS(l2e); pa |= (va & L2_OFFSET); return (pa); } l3 = pmap_l2_to_l3(&l2e, va); if (l3 == NULL) panic("pmap_kextract: No l3..."); pa = PTE_TO_PHYS(pmap_load(l3)); pa |= (va & PAGE_MASK); } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused) { pt_entry_t entry; pt_entry_t *l3; vm_offset_t va; pn_t pn; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *l3; l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); sfence_vma(); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *l3; vm_offset_t va; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pmap_clear(l3); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *l3, pa; vm_offset_t va; vm_page_t m; pt_entry_t entry; pn_t pn; int i; va = sva; for (i = 0; i < count; i++) { m = ma[i]; pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); l3 = pmap_l3(kernel_pmap, va); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *l3; vm_offset_t va; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); for (va = sva; count-- > 0; va += PAGE_SIZE) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); } pmap_invalidate_range(kernel_pmap, sva, va); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "ml3" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, ml3)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { return (FALSE); } } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (m->pindex >= NUL2E) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); } else { pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { pd_entry_t *l1; vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pmap_unwire_ptp(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); vm_wire_sub(1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l1 = kernel_pmap->pm_l1; pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); CPU_ZERO(&pmap->pm_active); pmap_activate_boot(pmap); } int pmap_pinit(pmap_t pmap) { vm_paddr_t l1phys; vm_page_t l1pt; /* * allocate the l1 page */ - while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) - vm_wait(NULL); + l1pt = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | + VM_ALLOC_WAITOK); l1phys = VM_PAGE_TO_PHYS(l1pt); pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); - if ((l1pt->flags & PG_ZERO) == 0) - pagezero(pmap->pm_l1); - bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); CPU_ZERO(&pmap->pm_active); /* Install kernel pagetables */ memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); /* Add to the list of all user pmaps */ mtx_lock(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock(&allpmaps_lock); vm_radix_init(&pmap->pm_root); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, /*pdppg, */pdpg; pt_entry_t entry; vm_paddr_t phys; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); vm_wait(NULL); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } - - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); + m->pindex = ptepindex; /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUL2E) { pd_entry_t *l1; vm_pindex_t l1index; l1index = ptepindex - NUL2E; l1 = &pmap->pm_l1[l1index]; KASSERT((pmap_load(l1) & PTE_V) == 0, ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, l1index, entry); } else { vm_pindex_t l1index; pd_entry_t *l1, *l2; l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pdpg->ref_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); l2 = &l2[ptepindex & Ln_ADDR_MASK]; KASSERT((pmap_load(l2) & PTE_V) == 0, ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { KASSERT((pmap_load(l1) & PTE_RWX) == 0, ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, pmap_load(l1), va)); /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); l2pg->ref_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *l2; vm_paddr_t phys; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock(&allpmaps_lock); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); vm_page_unwire_noq(m); vm_page_free(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l1, *l2; pt_entry_t entry; pn_t pn; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l1 = pmap_l1(kernel_pmap, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ - nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); + nkpg->pindex = kernel_vm_end >> L1_SHIFT; paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(kernel_pmap, pmap_l1_index(kernel_vm_end), entry); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & PTE_V) != 0 && (pmap_load(l2) & PTE_RWX) == 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } - nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) { - pmap_zero_page(nkpg); - } + nkpg->pindex = kernel_vm_end >> L2_SHIFT; paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("RISCVTODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } /* XXX PV STATS */ #if 0 dump_add_page(m->phys_addr); #endif pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void __unused pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; int bit, field; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va &= ~L2_OFFSET; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining 511 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field] != 0) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } /* XXX PV stats */ } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; rw_assert(&pvh_global_lock, RA_LOCKED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_promote_l2: misaligned va %#lx", va)); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); m = PHYS_TO_VM_PAGE(pa); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = PTE_TO_PHYS(l2e); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | PTE_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldl2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); oldl2 = pmap_load_clear(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); /* * The sfence.vma documentation states that it is sufficient to specify * a single address within a superpage mapping. However, since we do * not perform any invalidation upon promotion, TLBs may still be * caching 4KB mappings within the superpage, so we must invalidate the * entire range. */ pmap_invalidate_range(pmap, sva, sva + L2_SIZE); if ((oldl2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if ((oldl2 & PTE_SW_MANAGED) != 0) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); va < eva; va += PAGE_SIZE, m++) { if ((oldl2 & PTE_D) != 0) vm_page_dirty(m); if ((oldl2 & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->ref_count == Ln_ENTRIES, ("pmap_remove_l2: l3 page ref count error")); ml3->ref_count = 1; vm_page_unwire_noq(ml3); pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_paddr_t phys; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & PTE_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & PTE_SW_MANAGED) { phys = PTE_TO_PHYS(old_l3); m = PHYS_TO_VM_PAGE(phys); if ((old_l3 & PTE_D) != 0) vm_page_dirty(m); if (old_l3 & PTE_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct spglist free; struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { (void)pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { /* * The large page mapping was destroyed. */ continue; } l2e = pmap_load(l2); } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct spglist free; struct md_page *pvh; pmap_t pmap; pt_entry_t *l3, l3e; pd_entry_t *l2, l2e; pv_entry_t pv; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; l2 = pmap_l2(pmap, va); (void)pmap_demote_l2(pmap, l2, va); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); l2e = pmap_load(l2); KASSERT((l2e & PTE_RX) == 0, ("pmap_remove_all: found a superpage in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load_clear(l3); pmap_invalidate_page(pmap, pv->pv_va); if (l3e & PTE_SW_WIRED) pmap->pm_stats.wired_count--; if ((l3e & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((l3e & PTE_D) != 0) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e, mask; vm_page_t m, mt; vm_paddr_t pa; vm_offset_t va_next; bool anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; anychanged = false; pv_lists_locked = false; mask = 0; if ((prot & VM_PROT_WRITE) == 0) mask |= PTE_W | PTE_D; if ((prot & VM_PROT_EXECUTE) == 0) mask |= PTE_X; resume: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL || (l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { retryl2: if ((prot & VM_PROT_WRITE) == 0 && (l2e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { pa = PTE_TO_PHYS(l2e); m = PHYS_TO_VM_PAGE(pa); for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) goto retryl2; anychanged = true; continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); goto resume; } } if (!pmap_demote_l2(pmap, l2, sva)) { /* * The large page mapping was destroyed. */ continue; } } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { l3e = pmap_load(l3); retryl3: if ((l3e & PTE_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0 && (l3e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); vm_page_dirty(m); } if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) goto retryl3; anychanged = true; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } int pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) { pd_entry_t *l2, l2e; pt_entry_t bits, *pte, oldpte; int rv; rv = 0; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, va); if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) goto done; if ((l2e & PTE_RWX) == 0) { pte = pmap_l2_to_l3(l2, va); if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) goto done; } else { pte = l2; oldpte = l2e; } if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) goto done; bits = PTE_A; if (ftype == VM_PROT_WRITE) bits |= PTE_D; /* * Spurious faults can occur if the implementation caches invalid * entries in the TLB, or if simultaneous accesses on multiple CPUs * race with each other. */ if ((oldpte & bits) != bits) pmap_store_bits(pte, bits); sfence_vma(); rv = 1; done: PMAP_UNLOCK(pmap); return (rv); } static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) { struct rwlock *lock; bool rv; lock = NULL; rv = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { struct spglist free; vm_page_t mpte; pd_entry_t newl2, oldl2; pt_entry_t *firstl3, newl3; vm_paddr_t mptepa; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl2 = pmap_load(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { - if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, - pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : - VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == - NULL) { + if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( + (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | + VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l2_locked: " "failure for va %#lx in pmap %p", va, pmap); return (false); } + mpte->pindex = pmap_l2_pindex(va); if (va < VM_MAXUSER_ADDRESS) { mpte->ref_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; KASSERT((oldl2 & PTE_A) != 0, ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); newl3 = oldl2; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); } KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " "addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the L2 entry. Otherwise, the * state of the L2 entry and the PV lists will be inconsistent, which * can result in reclaim_pv_chunk() attempting to remove a PV entry from * the wrong PV list and pmap_pv_demote_l2() failing to find the * expected PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & PTE_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Demote the mapping. */ pmap_store(l2, newl2); /* * Demote the PV entry. */ if ((oldl2 & PTE_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", va, pmap); return (true); } #if VM_NRESERVLEVEL > 0 static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, firstl3e, *l3, l3e; vm_paddr_t pa; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); va &= ~L2_OFFSET; KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_promote_l2: invalid l2 entry %p", l2)); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); firstl3e = pmap_load(firstl3); pa = PTE_TO_PHYS(firstl3e); if ((pa & L2_OFFSET) != 0) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } /* * Downgrade a clean, writable mapping to read-only to ensure that the * hardware does not set PTE_D while we are comparing PTEs. * * Upon a write access to a clean mapping, the implementation will * either atomically check protections and set PTE_D, or raise a page * fault. In the latter case, the pmap lock provides atomicity. Thus, * we do not issue an sfence.vma here and instead rely on pmap_fault() * to do so lazily. */ while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { firstl3e &= ~PTE_W; break; } } pa += PAGE_SIZE; for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { l3e = pmap_load(l3); if (PTE_TO_PHYS(l3e) != pa) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } while ((l3e & (PTE_W | PTE_D)) == PTE_W) { if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { l3e &= ~PTE_W; break; } } if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; } ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); KASSERT(ml3->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, ml3, true)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((firstl3e & PTE_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); pmap_store(l2, firstl3e); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *l1, *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; pt_entry_t entry; pn_t l2_pn, l3_pn, pn; int rv; bool nosleep; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); new_l3 = PTE_V | PTE_R | PTE_A; if (prot & VM_PROT_EXECUTE) new_l3 |= PTE_X; if (flags & VM_PROT_WRITE) new_l3 |= PTE_D; if (prot & VM_PROT_WRITE) new_l3 |= PTE_W; if (va < VM_MAX_USER_ADDRESS) new_l3 |= PTE_U; new_l3 |= (pn << PTE_PPN0_S); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= PTE_SW_WIRED; /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if (prot & VM_PROT_WRITE) new_l3 |= PTE_D; } else new_l3 |= PTE_SW_MANAGED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; mpte = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va %#lx unaligned", va)); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); goto out; } l2 = pmap_l2(pmap, va); if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, va, &lock))) { l3 = pmap_l2_to_l3(l2, va); if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } l3 = pmap_l3(pmap, va); } else { l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { if (l2 == NULL) { - l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); - if ((l2_m->flags & PG_ZERO) == 0) - pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); l2_pn = (l2_pa / PAGE_SIZE); l1 = pmap_l1(pmap, va); entry = (PTE_V); entry |= (l2_pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, pmap_l1_index(va), entry); l2 = pmap_l1_to_l2(l1, va); } - l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | + VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); l3_pn = (l3_pa / PAGE_SIZE); entry = (PTE_V); entry |= (l3_pn << PTE_PPN0_S); pmap_store(l2, entry); l3 = pmap_l2_to_l3(l2, va); } pmap_invalidate_page(pmap, va); } orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); pv = NULL; /* * Is the specified virtual address already mapped? */ if ((orig_l3 & PTE_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & PTE_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & PTE_SW_MANAGED) != 0 && (new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ orig_l3 = pmap_load_clear(l3); KASSERT(PTE_TO_PHYS(orig_l3) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & PTE_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((orig_l3 & PTE_D) != 0) vm_page_dirty(om); if ((orig_l3 & PTE_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((new_l3 & PTE_SW_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); /* * Update the L3 entry. */ if (orig_l3 != 0) { orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); KASSERT(PTE_TO_PHYS(orig_l3) == pa, ("pmap_enter: invalid update")); if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == (PTE_D | PTE_SW_MANAGED)) vm_page_dirty(m); } else { pmap_store(l3, new_l3); } #if VM_NRESERVLEVEL > 0 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_l2(pmap, l2, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); if ((m->oflags & VPO_UNMANAGED) == 0) new_l2 |= PTE_SW_MANAGED; if ((prot & VM_PROT_EXECUTE) != 0) new_l2 |= PTE_X; if (va < VM_MAXUSER_ADDRESS) new_l2 |= PTE_U; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, *l3, oldl2; vm_offset_t sva; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((oldl2 = pmap_load(l2)) != 0) { KASSERT(l2pg->ref_count > 1, ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((oldl2 & PTE_RWX) != 0) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { l3 = pmap_l2_to_l3(l2, sva); if ((pmap_load(l3) & PTE_V) != 0 && pmap_remove_l3(pmap, l3, sva, oldl2, &free, lockp) != 0) break; } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & PTE_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, paging-structure * caches could nonetheless have entries that * refer to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & PTE_W) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; vm_paddr_t phys; pd_entry_t *l2; pt_entry_t *l3, newl3; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->ref_count++; } else { /* * Get the l2 entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; l3 = pmap_l3(kernel_pmap, va); } if (l3 == NULL) panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, false); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | PTE_V | PTE_R; if ((prot & VM_PROT_EXECUTE) != 0) newl3 |= PTE_X; if ((m->oflags & VPO_UNMANAGED) == 0) newl3 |= PTE_SW_MANAGED; if (va < VM_MAX_USER_ADDRESS) newl3 |= PTE_U; /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); pmap_store(l3, newl3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e; bool pv_lists_locked; pv_lists_locked = false; retry: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { if ((l2e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l2e); pmap_clear_bits(l2, PTE_SW_WIRED); continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); /* Repeat sva. */ goto retry; } } if (!pmap_demote_l2(pmap, l2, sva)) panic("pmap_unwire: demotion failed"); } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if ((l3e = pmap_load(l3)) == 0) continue; if ((l3e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l3e); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ pmap_clear_bits(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("%s: found a 2mpage in page %p's pv list", __func__, m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if ((pmap_load(l3) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); if ((pmap_load(l2) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } /* * Returns true if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns false. */ bool pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; bool rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (false); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } static void pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, struct spglist *free, bool superpage) { struct md_page *pvh; vm_page_t mpte, mt; if (superpage) { pmap_resident_count_dec(pmap, Ln_ENTRIES); pvh = pa_to_pvh(m->phys_addr); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && (mt->a.flags & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == Ln_ENTRIES, ("pmap_remove_pages: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->a.flags & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { struct spglist free; pd_entry_t ptepde; pt_entry_t *pte, tpte; vm_page_t m, mt; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; bool superpage; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_l1(pmap, pv->pv_va); ptepde = pmap_load(pte); pte = pmap_l1_to_l2(pte, pv->pv_va); tpte = pmap_load(pte); if ((tpte & PTE_RWX) != 0) { superpage = true; } else { ptepde = tpte; pte = pmap_l2_to_l3(pte, pv->pv_va); tpte = pmap_load(pte); superpage = false; } /* * We cannot remove wired pages from a * process' mapping at this time. */ if (tpte & PTE_SW_WIRED) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { if (superpage) for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_remove_pages_pv(pmap, m, pv, &free, superpage); pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } static bool pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct md_page *pvh; struct rwlock *lock; pd_entry_t *l2; pt_entry_t *l3, mask; pv_entry_t pv; pmap_t pmap; int md_gen, pvh_gen; bool rv; mask = 0; if (modified) mask |= PTE_D; if (accessed) mask |= PTE_A; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("%s: found a 2mpage in page %p's pv list", __func__, m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); rv = (pmap_load(l3) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); rv = (pmap_load(l2) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *l3; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); l3 = pmap_l3(pmap, addr); if (l3 != NULL && pmap_load(l3) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3, oldl3, newl3; pv_entry_t next_pv, pv; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_rlock(&pvh_global_lock); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); if ((pmap_load(l2) & PTE_W) != 0) (void)pmap_demote_l2_locked(pmap, l2, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("%s: found a 2mpage in page %p's pv list", __func__, m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); retry: if ((oldl3 & PTE_W) != 0) { newl3 = oldl3 & ~(PTE_D | PTE_W); if (!atomic_fcmpset_long(l3, &oldl3, newl3)) goto retry; if ((oldl3 & PTE_D) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct spglist free; struct md_page *pvh; struct rwlock *lock; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *l2, l2e; pt_entry_t *l3, l3e; vm_paddr_t pa; vm_offset_t va; int cleared, md_gen, not_cleared, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); l2e = pmap_load(l2); if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { /* * Although l2e is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((l2e & PTE_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (l2e & PTE_SW_WIRED) == 0) { pmap_clear_bits(l2, PTE_A); pmap_invalidate_page(pmap, va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RX) == 0, ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load(l3); if ((l3e & PTE_D) != 0) vm_page_dirty(m); if ((l3e & PTE_A) != 0) { if ((l3e & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_clear_bits(l3, PTE_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->a.flags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(&pvh_global_lock); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has PTE_W set, then it also has PTE_D set. */ if ((oldl2 & PTE_W) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & PTE_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); l3 = pmap_l2_to_l3(l2, va); pmap_clear_bits(l3, PTE_D | PTE_W); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("%s: found a 2mpage in page %p's pv list", __func__, m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { pmap_clear_bits(l3, PTE_D | PTE_W); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); rw_runlock(&pvh_global_lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *l1, l1e; pd_entry_t *l2, l2e; pt_entry_t *l3, l3e; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base) && !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { l1 = pmap_l1(kernel_pmap, tmpva); if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) return (EINVAL); if ((l1e & PTE_RWX) != 0) { /* * TODO: Demote if attributes don't match and there * isn't an L1 page left in the range, and update the * L1 entry if the attributes don't match but there is * an L1 page left in the range, once we support the * upcoming Svpbmt extension. */ tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; continue; } l2 = pmap_l1_to_l2(l1, tmpva); if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) return (EINVAL); if ((l2e & PTE_RWX) != 0) { /* * TODO: Demote if attributes don't match and there * isn't an L2 page left in the range, and update the * L2 entry if the attributes don't match but there is * an L2 page left in the range, once we support the * upcoming Svpbmt extension. */ tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; continue; } l3 = pmap_l2_to_l3(l2, tmpva); if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) return (EINVAL); /* * TODO: Update the L3 entry if the attributes don't match once * we support the upcoming Svpbmt extension. */ tmpva += PAGE_SIZE; } return (0); } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *l2, *l3, tpte; vm_paddr_t pa; int val; bool managed; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, addr); if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { if ((tpte & PTE_RWX) != 0) { pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); val = MINCORE_INCORE | MINCORE_PSIND(1); } else { l3 = pmap_l2_to_l3(l2, addr); tpte = pmap_load(l3); if ((tpte & PTE_V) == 0) { PMAP_UNLOCK(pmap); return (0); } pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); val = MINCORE_INCORE; } if ((tpte & PTE_D) != 0) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & PTE_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int hart; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (pmap == oldpmap) return; load_satp(pmap->pm_satp); hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); CPU_CLR(hart, &oldpmap->pm_active); #endif PCPU_SET(curpmap, pmap); sfence_vma(); } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { u_int hart; hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { cpuset_t mask; /* * From the RISC-V User-Level ISA V2.2: * * "To make a store to instruction memory visible to all * RISC-V harts, the writing hart has to execute a data FENCE * before requesting that all remote RISC-V harts execute a * FENCE.I." * * However, this is slightly misleading; we still need to * perform a FENCE.I for the local hart, as FENCE does nothing * for its icache. FENCE.I alone is also sufficient for the * local hart. */ sched_pin(); mask = all_harts; CPU_CLR(PCPU_GET(hart), &mask); fence_i(); if (!CPU_EMPTY(&mask) && smp_started) { fence(); sbi_remote_fence_i(mask.__bits); } sched_unpin(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l1p, *l2p; /* Get l1 directory entry. */ l1p = pmap_l1(pmap, va); *l1 = l1p; if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) return (false); if ((pmap_load(l1p) & PTE_RX) != 0) { *l2 = NULL; *l3 = NULL; return (true); } /* Get l2 directory entry. */ l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) return (false); if ((pmap_load(l2p) & PTE_RX) != 0) { *l3 = NULL; return (true); } /* Get l3 page table entry. */ *l3 = pmap_l2_to_l3(l2p, va); return (true); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int l3pages; int l2pages; int l1pages; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { if (eva <= range->sva) return; sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", range->sva, eva, (range->attrs & PTE_W) == PTE_W ? 'w' : '-', (range->attrs & PTE_X) == PTE_X ? 'x' : '-', (range->attrs & PTE_U) == PTE_U ? 'u' : 's', (range->attrs & PTE_G) == PTE_G ? 'g' : '-', range->l1pages, range->l2pages, range->l3pages); /* Reset to sentinel value. */ range->sva = 0xfffffffffffffffful; } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { return (range->attrs == attrs); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) { pt_entry_t attrs; /* The PTE global bit is inherited by lower levels. */ attrs = l1e & PTE_G; if ((l1e & PTE_RWX) != 0) attrs |= l1e & (PTE_RWX | PTE_U); else if (l2e != 0) attrs |= l2e & PTE_G; if ((l2e & PTE_RWX) != 0) attrs |= l2e & (PTE_RWX | PTE_U); else if (l3e != 0) attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pd_entry_t l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = 0xfffffffffffffffful; /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Kernel page table pages are never freed, so at worst we will * observe inconsistencies in the output. */ sva = VM_MIN_KERNEL_ADDRESS; for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) sbuf_printf(sb, "\nDirect map:\n"); else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) sbuf_printf(sb, "\nKernel map:\n"); l1e = kernel_pmap->pm_l1[i]; if ((l1e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L1_SIZE; continue; } if ((l1e & PTE_RWX) != 0) { sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); range.l1pages++; sva += L1_SIZE; continue; } pa = PTE_TO_PHYS(l1e); l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { l2e = l2[j]; if ((l2e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L2_SIZE; continue; } if ((l2e & PTE_RWX) != 0) { sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); range.l2pages++; sva += L2_SIZE; continue; } pa = PTE_TO_PHYS(l2e); l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, sva += L3_SIZE) { l3e = l3[k]; if ((l3e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, l1e, l2e, l3e); range.l3pages++; } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); diff --git a/sys/riscv/riscv/uma_machdep.c b/sys/riscv/riscv/uma_machdep.c index f1725fde4699..0c8abacd0302 100644 --- a/sys/riscv/riscv/uma_machdep.c +++ b/sys/riscv/riscv/uma_machdep.c @@ -1,73 +1,71 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_domain(NULL, 0, domain, - malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); - if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) - bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 12cab490781b..a510295b3c65 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -1,5859 +1,5859 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2019 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #include #ifdef INVARIANTS #define UMA_ALWAYS_CTORDTOR 1 #else #define UMA_ALWAYS_CTORDTOR 0 #endif /* * This is the zone and keg from which all zones are spawned. */ static uma_zone_t kegs; static uma_zone_t zones; /* * On INVARIANTS builds, the slab contains a second bitset of the same size, * "dbg_bits", which is laid out immediately after us_free. */ #ifdef INVARIANTS #define SLAB_BITSETS 2 #else #define SLAB_BITSETS 1 #endif /* * These are the two zones from which all offpage uma_slab_ts are allocated. * * One zone is for slab headers that can represent a larger number of items, * making the slabs themselves more efficient, and the other zone is for * headers that are smaller and represent fewer items, making the headers more * efficient. */ #define SLABZONE_SIZE(setsize) \ (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS) #define SLABZONE0_SETSIZE (PAGE_SIZE / 16) #define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE #define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE) #define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE) static uma_zone_t slabzones[2]; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* * Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of * zones. */ static struct rwlock_padalign __exclusive_cache_line uma_rwlock; static struct sx uma_reclaim_lock; /* * First available virual address for boot time allocations. */ static vm_offset_t bootstart; static vm_offset_t bootmem; /* * kmem soft limit, initialized by uma_set_limit(). Ensure that early * allocations don't trigger a wakeup of the reclaim thread. */ unsigned long uma_kmem_limit = LONG_MAX; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, "UMA kernel memory soft limit"); unsigned long uma_kmem_total; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, "UMA kernel memory usage"); /* Is the VM done starting up? */ static enum { BOOT_COLD, BOOT_KVA, BOOT_PCPU, BOOT_RUNNING, BOOT_SHUTDOWN, } booted = BOOT_COLD; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; const char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) struct uma_bucket_zone bucket_zones[] = { /* Literal bucket sizes. */ { NULL, "2 Bucket", 2, 4096 }, { NULL, "4 Bucket", 4, 3072 }, { NULL, "8 Bucket", 8, 2048 }, { NULL, "16 Bucket", 16, 1024 }, /* Rounded down power of 2 sizes for efficiency. */ { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_CNT = 0x00000001, SKIP_DTOR = 0x00010000, SKIP_FINI = 0x00020000, }; /* Prototypes.. */ void uma_startup1(vm_offset_t); void uma_startup2(void); static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_reclaim(uma_zone_t zone, bool, int); static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static void keg_drain(uma_keg_t keg, int domain); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip); static int zero_init(void *, int, int); static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws); static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *); static void zone_timeout(uma_zone_t zone, void *); static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_shutdown(void); static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static int zone_alloc_limit(uma_zone_t zone, int count, int flags); static void zone_free_limit(uma_zone_t zone, int count); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(int domain); static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); static size_t slab_sizeof(int nitems); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(void *, void **, int, int, int); static void zone_release(void *, void **, int); static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); static uint64_t uma_zone_get_allocs(uma_zone_t zone); static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Memory allocation debugging"); #ifdef INVARIANTS static uint64_t uma_keg_get_allocs(uma_keg_t zone); static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); static bool uma_dbg_kskip(uma_keg_t keg, void *mem); static bool uma_dbg_zskip(uma_zone_t zone, void *mem); static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); static u_int dbg_divisor = 1; SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, "Debug & thrash every this item in memory allocator"); static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; static counter_u64_t uma_skip_cnt = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD, &uma_dbg_cnt, "memory items debugged"); SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD, &uma_skip_cnt, "memory items skipped, not debugged"); #endif SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Universal Memory Allocator"); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); static int multipage_slabs = 1; TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs); SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0, "UMA may choose larger slab sizes for better efficiency"); /* * Select the slab zone for an offpage slab with the given maximum item count. */ static inline uma_zone_t slabzone(int ipers) { return (slabzones[ipers > SLABZONE0_SETSIZE]); } /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { KASSERT(booted >= BOOT_KVA, ("Bucket enable before init")); bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_FIRSTTOUCH); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * Don't allocate buckets early in boot. */ if (__predict_false(booted < BOOT_KVA)) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if (((uintptr_t)udata & UMA_ZONE_VM) != 0) flags |= M_NOVM; ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size)); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = min(ubz->ubz_entries, zone->uz_bucket_size_max); bucket->ub_seq = SMR_SEQ_INVALID; CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p", zone->uz_name, zone, bucket); } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; if (bucket->ub_cnt != 0) bucket_drain(zone, bucket); KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); KASSERT(bucket->ub_seq == SMR_SEQ_INVALID, ("bucket_free: Freeing an SMR bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(int domain) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN, domain); } #ifdef KASAN _Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0, "Base UMA allocation size not a multiple of the KASAN scale factor"); static void kasan_mark_item_valid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz, rsz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = zone->uz_size; rsz = roundup2(sz, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kasan_mark(item, sz, rsz, KASAN_GENERIC_REDZONE); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), sz, rsz, KASAN_GENERIC_REDZONE); } } static void kasan_mark_item_invalid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = roundup2(zone->uz_size, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kasan_mark(item, 0, sz, KASAN_UMA_FREED); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), 0, sz, KASAN_UMA_FREED); } } static void kasan_mark_slab_valid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { sz = keg->uk_ppera * PAGE_SIZE; kasan_mark(mem, sz, sz, 0); } } static void kasan_mark_slab_invalid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) sz = keg->uk_ppera * PAGE_SIZE; else sz = keg->uk_pgoff; kasan_mark(mem, 0, sz, KASAN_UMA_FREED); } } #else /* !KASAN */ static void kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused) { } static void kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused) { } #endif /* KASAN */ #ifdef KMSAN static inline void kmsan_mark_item_uninitialized(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & (UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0) { /* * Cache zones should not be instrumented by default, as UMA * does not have enough information to do so correctly. * Consumers can mark items themselves if it makes sense to do * so. * * Items from secondary zones are initialized by the parent * zone and thus cannot safely be marked by UMA. * * malloc zones are handled directly by malloc(9) and friends, * since they can provide more precise origin tracking. */ return; } if (zone->uz_keg->uk_init != NULL) { /* * By definition, initialized items cannot be marked. The * best we can do is mark items from these zones after they * are freed to the keg. */ return; } sz = zone->uz_size; if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kmsan_orig(item, sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(item, sz, KMSAN_STATE_UNINIT); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) { kmsan_orig(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_STATE_INITED); } } } #else /* !KMSAN */ static inline void kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused) { } #endif /* KMSAN */ /* * Acquire the domain lock and record contention. */ static uma_zone_domain_t zone_domain_lock(uma_zone_t zone, int domain) { uma_zone_domain_t zdom; bool lockfail; zdom = ZDOM_GET(zone, domain); lockfail = false; if (ZDOM_OWNED(zdom)) lockfail = true; ZDOM_LOCK(zdom); /* This is unsynchronized. The counter does not need to be precise. */ if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) zone->uz_bucket_size++; return (zdom); } /* * Search for the domain with the least cached items and return it if it * is out of balance with the preferred domain. */ static __noinline int zone_domain_lowest(uma_zone_t zone, int pref) { long least, nitems, prefitems; int domain; int i; prefitems = least = LONG_MAX; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems < least) { domain = i; least = nitems; } if (domain == pref) prefitems = nitems; } if (prefitems < least * 2) return (pref); return (domain); } /* * Search for the domain with the most cached items and return it or the * preferred domain if it has enough to proceed. */ static __noinline int zone_domain_highest(uma_zone_t zone, int pref) { long most, nitems; int domain; int i; if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX) return (pref); most = 0; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems > most) { domain = i; most = nitems; } } return (domain); } /* * Set the maximum imax value. */ static void zone_domain_imax_set(uma_zone_domain_t zdom, int nitems) { long old; old = zdom->uzd_imax; do { if (old >= nitems) return; } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0); /* * We are at new maximum, so do the last WSS update for the old * bimin and prepare to measure next allocation batch. */ if (zdom->uzd_wss < old - zdom->uzd_bimin) zdom->uzd_wss = old - zdom->uzd_bimin; zdom->uzd_bimin = nitems; } /* * Attempt to satisfy an allocation by retrieving a full bucket from one of the * zone's caches. If a bucket is found the zone is not locked on return. */ static uma_bucket_t zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim) { uma_bucket_t bucket; long cnt; int i; bool dtor = false; ZDOM_LOCK_ASSERT(zdom); if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL) return (NULL); /* SMR Buckets can not be re-used until readers expire. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { if (!smr_poll(zone->uz_smr, bucket->ub_seq, false)) return (NULL); bucket->ub_seq = SMR_SEQ_INVALID; dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR; if (STAILQ_NEXT(bucket, ub_link) != NULL) zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq; } STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link); KASSERT(zdom->uzd_nitems >= bucket->ub_cnt, ("%s: item count underflow (%ld, %d)", __func__, zdom->uzd_nitems, bucket->ub_cnt)); KASSERT(bucket->ub_cnt > 0, ("%s: empty bucket in bucket cache", __func__)); zdom->uzd_nitems -= bucket->ub_cnt; if (reclaim) { /* * Shift the bounds of the current WSS interval to avoid * perturbing the estimates. */ cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt); atomic_subtract_long(&zdom->uzd_imax, cnt); zdom->uzd_bimin -= cnt; zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt); if (zdom->uzd_limin >= bucket->ub_cnt) { zdom->uzd_limin -= bucket->ub_cnt; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } } else if (zdom->uzd_bimin > zdom->uzd_nitems) { zdom->uzd_bimin = zdom->uzd_nitems; if (zdom->uzd_imin > zdom->uzd_nitems) zdom->uzd_imin = zdom->uzd_nitems; } ZDOM_UNLOCK(zdom); if (dtor) for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); return (bucket); } /* * Insert a full bucket into the specified cache. The "ws" parameter indicates * whether the bucket's contents should be counted as part of the zone's working * set. The bucket may be freed if it exceeds the bucket limit. */ static void zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata, const bool ws) { uma_zone_domain_t zdom; /* We don't cache empty buckets. This can happen after a reclaim. */ if (bucket->ub_cnt == 0) goto out; zdom = zone_domain_lock(zone, domain); /* * Conditionally set the maximum number of items. */ zdom->uzd_nitems += bucket->ub_cnt; if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) { if (ws) { zone_domain_imax_set(zdom, zdom->uzd_nitems); } else { /* * Shift the bounds of the current WSS interval to * avoid perturbing the estimates. */ atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt); zdom->uzd_imin += bucket->ub_cnt; zdom->uzd_bimin += bucket->ub_cnt; zdom->uzd_limin += bucket->ub_cnt; } if (STAILQ_EMPTY(&zdom->uzd_buckets)) zdom->uzd_seq = bucket->ub_seq; /* * Try to promote reuse of recently used items. For items * protected by SMR, try to defer reuse to minimize polling. */ if (bucket->ub_seq == SMR_SEQ_INVALID) STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); ZDOM_UNLOCK(zdom); return; } zdom->uzd_nitems -= bucket->ub_cnt; ZDOM_UNLOCK(zdom); out: bucket_free(zone, bucket, udata); } /* Pops an item out of a per-cpu cache bucket. */ static inline void * cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) { void *item; CRITICAL_ASSERT(curthread); bucket->ucb_cnt--; item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; #ifdef INVARIANTS bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); #endif cache->uc_allocs++; return (item); } /* Pushes an item into a per-cpu cache bucket. */ static inline void cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) { CRITICAL_ASSERT(curthread); KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; bucket->ucb_cnt++; cache->uc_frees++; } /* * Unload a UMA bucket from a per-cpu cache. */ static inline uma_bucket_t cache_bucket_unload(uma_cache_bucket_t bucket) { uma_bucket_t b; b = bucket->ucb_bucket; if (b != NULL) { MPASS(b->ub_entries == bucket->ucb_entries); b->ub_cnt = bucket->ucb_cnt; bucket->ucb_bucket = NULL; bucket->ucb_entries = bucket->ucb_cnt = 0; } return (b); } static inline uma_bucket_t cache_bucket_unload_alloc(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_allocbucket)); } static inline uma_bucket_t cache_bucket_unload_free(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_freebucket)); } static inline uma_bucket_t cache_bucket_unload_cross(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_crossbucket)); } /* * Load a bucket into a per-cpu cache bucket. */ static inline void cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) { CRITICAL_ASSERT(curthread); MPASS(bucket->ucb_bucket == NULL); MPASS(b->ub_seq == SMR_SEQ_INVALID); bucket->ucb_bucket = b; bucket->ucb_cnt = b->ub_cnt; bucket->ucb_entries = b->ub_entries; } static inline void cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_allocbucket, b); } static inline void cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_freebucket, b); } #ifdef NUMA static inline void cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_crossbucket, b); } #endif /* * Copy and preserve ucb_spare. */ static inline void cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { b1->ucb_bucket = b2->ucb_bucket; b1->ucb_entries = b2->ucb_entries; b1->ucb_cnt = b2->ucb_cnt; } /* * Swap two cache buckets. */ static inline void cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { struct uma_cache_bucket b3; CRITICAL_ASSERT(curthread); cache_bucket_copy(&b3, b1); cache_bucket_copy(b1, b2); cache_bucket_copy(b2, &b3); } /* * Attempt to fetch a bucket from a zone on behalf of the current cpu cache. */ static uma_bucket_t cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; /* * Avoid the lock if possible. */ zdom = ZDOM_GET(zone, domain); if (zdom->uzd_nitems == 0) return (NULL); if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 && !smr_poll(zone->uz_smr, zdom->uzd_seq, false)) return (NULL); /* * Check the zone's cache of buckets. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) return (bucket); ZDOM_UNLOCK(zdom); return (NULL); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static inline void zone_maxaction(uma_zone_t zone) { if (zone->uz_maxaction.ta_func != NULL) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout, NULL); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Update the working set size estimates for the zone's bucket cache. * The constants chosen here are somewhat arbitrary. */ static void zone_domain_update_wss(uma_zone_domain_t zdom) { long m; ZDOM_LOCK_ASSERT(zdom); MPASS(zdom->uzd_imax >= zdom->uzd_nitems); MPASS(zdom->uzd_nitems >= zdom->uzd_bimin); MPASS(zdom->uzd_bimin >= zdom->uzd_imin); /* * Estimate WSS as modified moving average of biggest allocation * batches for each period over few minutes (UMA_TIMEOUT of 20s). */ zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4, zdom->uzd_imax - zdom->uzd_bimin); /* * Estimate longtime minimum item count as a combination of recent * minimum item count, adjusted by WSS for safety, and the modified * moving average over the last several hours (UMA_TIMEOUT of 20s). * timin measures time since limin tried to go negative, that means * we were dangerously close to or got out of cache. */ m = zdom->uzd_imin - zdom->uzd_wss; if (m >= 0) { if (zdom->uzd_limin >= m) zdom->uzd_limin = m; else zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256; zdom->uzd_timin++; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } /* To reduce period edge effects on WSS keep half of the imax. */ atomic_subtract_long(&zdom->uzd_imax, (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2); zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems; } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void zone_timeout(uma_zone_t zone, void *unused) { uma_keg_t keg; u_int slabs, pages; if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) goto trim; keg = zone->uz_keg; /* * Hash zones are non-numa by definition so the first domain * is the only one present. */ KEG_LOCK(keg, 0); pages = keg->uk_domain[0].ud_pages; /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ KEG_UNLOCK(keg, 0); ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg, 0); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg, 0); hash_free(&oldhash); goto trim; } } KEG_UNLOCK(keg, 0); trim: /* Trim caches not used for a long time. */ for (int i = 0; i < vm_ndomains; i++) { if (bucket_cache_reclaim_domain(zone, false, false, i) && (zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, i); } } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on success and 0 on failure. */ static int hash_alloc(struct uma_hash *hash, u_int size) { size_t alloc; KASSERT(powerof2(size), ("hash size must be power of 2")); if (size > UMA_HASH_SIZE_INIT) { hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_hash_slab_t slab; u_int hval; u_int idx; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (idx = 0; idx < oldhash->uh_hashsize; idx++) while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); LIST_REMOVE(slab, uhs_hlink); hval = UMA_HASH(newhash, slab->uhs_data); LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, uhs_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket->ub_cnt == 0) return; if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { smr_wait(zone->uz_smr, bucket->ub_seq); bucket->ub_seq = SMR_SEQ_INVALID; for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); } if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); } zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); if (zone->uz_max_items > 0) zone_free_limit(zone, bucket->ub_cnt); #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt); #endif bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being torn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t bucket; smr_seq_t seq; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket = cache_bucket_unload_alloc(cache); if (bucket != NULL) bucket_free(zone, bucket, NULL); bucket = cache_bucket_unload_free(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } bucket = cache_bucket_unload_cross(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } } bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN); } static void cache_shrink(uma_zone_t zone, void *unused) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; ZONE_LOCK(zone); zone->uz_bucket_size = (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; ZONE_UNLOCK(zone); } static void cache_drain_safe_cpu(uma_zone_t zone, void *unused) { uma_cache_t cache; uma_bucket_t b1, b2, b3; int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = b3 = NULL; critical_enter(); cache = &zone->uz_cpu[curcpu]; domain = PCPU_GET(domain); b1 = cache_bucket_unload_alloc(cache); /* * Don't flush SMR zone buckets. This leaves the zone without a * bucket and forces every free to synchronize(). */ if ((zone->uz_flags & UMA_ZONE_SMR) == 0) { b2 = cache_bucket_unload_free(cache); b3 = cache_bucket_unload_cross(cache); } critical_exit(); if (b1 != NULL) zone_free_bucket(zone, b1, NULL, domain, false); if (b2 != NULL) zone_free_bucket(zone, b2, NULL, domain, false); if (b3 != NULL) { /* Adjust the domain so it goes to zone_free_cross. */ domain = (domain + 1) % vm_ndomains; zone_free_bucket(zone, b3, NULL, domain, false); } } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. */ static void pcpu_cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enough, shrink aggressively. */ if (zone) cache_shrink(zone, NULL); else zone_foreach(cache_shrink, NULL); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone, NULL); else zone_foreach(cache_drain_safe_cpu, NULL); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller * requested a drain, otherwise the per-domain caches are trimmed to either * estimated working set size. */ static bool bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; long target; bool done = false; /* * The cross bucket is partially filled and not part of * the item count. Reclaim it individually here. */ zdom = ZDOM_GET(zone, domain); if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) { ZONE_CROSS_LOCK(zone); bucket = zdom->uzd_cross; zdom->uzd_cross = NULL; ZONE_CROSS_UNLOCK(zone); if (bucket != NULL) bucket_free(zone, bucket, NULL); } /* * If we were asked to drain the zone, we are done only once * this bucket cache is empty. If trim, we reclaim items in * excess of the zone's estimated working set size. Multiple * consecutive calls will shrink the WSS and so reclaim more. * If neither drain nor trim, then voluntarily reclaim 1/4 * (to reduce first spike) of items not used for a long time. */ ZDOM_LOCK(zdom); zone_domain_update_wss(zdom); if (drain) target = 0; else if (trim) target = zdom->uzd_wss; else if (zdom->uzd_timin > 900 / UMA_TIMEOUT) target = zdom->uzd_nitems - zdom->uzd_limin / 4; else { ZDOM_UNLOCK(zdom); return (done); } while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL && zdom->uzd_nitems >= target + bucket->ub_cnt) { bucket = zone_fetch_bucket(zone, zdom, true); if (bucket == NULL) break; bucket_free(zone, bucket, NULL); done = true; ZDOM_LOCK(zdom); } ZDOM_UNLOCK(zdom); return (done); } static void bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain) { int i; /* * Shrink the zone bucket size to ensure that the per-CPU caches * don't grow too large. */ if (zone->uz_bucket_size > zone->uz_bucket_size_min) zone->uz_bucket_size--; if (domain != UMA_ANYDOMAIN && (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) { bucket_cache_reclaim_domain(zone, drain, true, domain); } else { for (i = 0; i < vm_ndomains; i++) bucket_cache_reclaim_domain(zone, drain, true, i); } } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; size_t size; int i; uint8_t flags; CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); mem = slab_data(slab, keg); size = PAGE_SIZE * keg->uk_ppera; kasan_mark_slab_valid(keg, mem); if (keg->uk_fini != NULL) { for (i = start - 1; i > -1; i--) #ifdef INVARIANTS /* * trash_fini implies that dtor was trash_dtor. trash_fini * would check that memory hasn't been modified since free, * which executed trash_dtor. * That's why we need to run uma_dbg_kskip() check here, * albeit we don't make skip check for other init/fini * invocations. */ if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || keg->uk_fini != trash_fini) #endif keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); } flags = slab->us_flags; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); } keg->uk_freef(mem, size, flags); uma_total_dec(size); } static void keg_drain_domain(uma_keg_t keg, int domain) { struct slabhead freeslabs; uma_domain_t dom; uma_slab_t slab, tmp; uint32_t i, stofree, stokeep, partial; dom = &keg->uk_domain[domain]; LIST_INIT(&freeslabs); CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", keg->uk_name, keg, domain, dom->ud_free_items); KEG_LOCK(keg, domain); /* * Are the free items in partially allocated slabs sufficient to meet * the reserve? If not, compute the number of fully free slabs that must * be kept. */ partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers; if (partial < keg->uk_reserve) { stokeep = min(dom->ud_free_slabs, howmany(keg->uk_reserve - partial, keg->uk_ipers)); } else { stokeep = 0; } stofree = dom->ud_free_slabs - stokeep; /* * Partition the free slabs into two sets: those that must be kept in * order to maintain the reserve, and those that may be released back to * the system. Since one set may be much larger than the other, * populate the smaller of the two sets and swap them if necessary. */ for (i = min(stofree, stokeep); i > 0; i--) { slab = LIST_FIRST(&dom->ud_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&freeslabs, slab, us_link); } if (stofree > stokeep) LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link); if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) { LIST_FOREACH(slab, &freeslabs, us_link) UMA_HASH_REMOVE(&keg->uk_hash, slab); } dom->ud_free_items -= stofree * keg->uk_ipers; dom->ud_free_slabs -= stofree; dom->ud_pages -= stofree * keg->uk_ppera; KEG_UNLOCK(keg, domain); LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp) keg_free_slab(keg, slab, keg->uk_ipers); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg, int domain) { int i; if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0) return; if (domain != UMA_ANYDOMAIN) { keg_drain_domain(keg, domain); } else { for (i = 0; i < vm_ndomains; i++) keg_drain_domain(keg, i); } } static void zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain) { /* * Count active reclaim operations in order to interlock with * zone_dtor(), which removes the zone from global lists before * attempting to reclaim items itself. * * The zone may be destroyed while sleeping, so only zone_dtor() should * specify M_WAITOK. */ ZONE_LOCK(zone); if (waitok == M_WAITOK) { while (zone->uz_reclaimers > 0) msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1); } zone->uz_reclaimers++; ZONE_UNLOCK(zone); bucket_cache_reclaim(zone, drain, domain); if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, domain); ZONE_LOCK(zone); zone->uz_reclaimers--; if (zone->uz_reclaimers == 0) wakeup(zone); ZONE_UNLOCK(zone); } static void zone_drain(uma_zone_t zone, void *arg) { int domain; domain = (int)(uintptr_t)arg; zone_reclaim(zone, domain, M_NOWAIT, true); } static void zone_trim(uma_zone_t zone, void *arg) { int domain; domain = (int)(uintptr_t)arg; zone_reclaim(zone, domain, M_NOWAIT, false); } /* * Allocate a new slab for a keg and inserts it into the partial slab list. * The keg should be unlocked on entry. If the allocation succeeds it will * be locked on return. * * Arguments: * flags Wait flags for the item initialization routine * aflags Wait flags for the slab allocation * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, int aflags) { uma_domain_t dom; uma_slab_t slab; unsigned long size; uint8_t *mem; uint8_t sflags; int i; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_alloc_slab: domain %d out of range", domain)); slab = NULL; mem = NULL; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { uma_hash_slab_t hslab; hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, domain, aflags); if (hslab == NULL) goto fail; slab = &hslab->uhs_slab; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) aflags |= M_ZERO; else aflags &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) aflags |= M_NODUMP; /* zone is passed for legacy reasons. */ size = keg->uk_ppera * PAGE_SIZE; mem = keg->uk_allocf(zone, size, domain, &sflags, aflags); if (mem == NULL) { if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); goto fail; } uma_total_inc(size); /* For HASH zones all pages go to the same uma_domain. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) slab = (uma_slab_t)(mem + keg->uk_pgoff); else slab_tohashslab(slab)->uhs_data = mem; if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), zone, slab); slab->us_freecount = keg->uk_ipers; slab->us_flags = sflags; slab->us_domain = domain; BIT_FILL(keg->uk_ipers, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); #endif if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab_item(slab, keg, i), keg->uk_size, flags) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); goto fail; } } kasan_mark_slab_invalid(keg, mem); KEG_LOCK(keg, domain); CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", slab, keg->uk_name, keg); if (keg->uk_flags & UMA_ZFLAG_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ dom = &keg->uk_domain[domain]; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); dom->ud_pages += keg->uk_ppera; dom->ud_free_items += keg->uk_ipers; return (slab); fail: return (NULL); } /* * This function is intended to be used early on in place of page_alloc(). It * performs contiguous physical memory allocations and uses a bump allocator for * KVA, so is usable before the kernel map is initialized. */ static void * startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vm_paddr_t pa; vm_page_t m; void *mem; int pages; int i; pages = howmany(bytes, PAGE_SIZE); KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); *pflag = UMA_SLAB_BOOT; m = vm_page_alloc_contig_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); for (i = 0; i < pages; i++, pa += PAGE_SIZE) { #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif } /* Allocate KVA and indirectly advance bootmem. */ mem = (void *)pmap_map(&bootmem, m->phys_addr, m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE); if ((wait & M_ZERO) != 0) bzero(mem, pages * PAGE_SIZE); return (mem); } static void startup_free(void *mem, vm_size_t bytes) { vm_offset_t va; vm_page_t m; va = (vm_offset_t)mem; m = PHYS_TO_VM_PAGE(pmap_kextract(va)); /* * startup_alloc() returns direct-mapped slabs on some platforms. Avoid * unmapping ranges of the direct map. */ if (va >= bootstart && va + bytes <= bootmem) pmap_remove(kernel_pmap, va, va + bytes); for (; bytes != 0; bytes -= PAGE_SIZE, m++) { #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); return (p); } static void * pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { struct pglist alloctail; vm_offset_t addr, zkva; int cpu, flags; vm_page_t p, p_next; #ifdef NUMA struct pcpu *pc; #endif MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); TAILQ_INIT(&alloctail); - flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | - malloc2vm_flags(wait); + flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | malloc2vm_flags(wait); *pflag = UMA_SLAB_KERNEL; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) { - p = vm_page_alloc(NULL, 0, flags); + p = vm_page_alloc_noobj(flags); } else { #ifndef NUMA - p = vm_page_alloc(NULL, 0, flags); + p = vm_page_alloc_noobj(flags); #else pc = pcpu_find(cpu); if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) p = NULL; else - p = vm_page_alloc_domain(NULL, 0, - pc->pc_domain, flags); + p = vm_page_alloc_noobj_domain(pc->pc_domain, + flags); if (__predict_false(p == NULL)) - p = vm_page_alloc(NULL, 0, flags); + p = vm_page_alloc_noobj(flags); #endif } if (__predict_false(p == NULL)) goto fail; TAILQ_INSERT_TAIL(&alloctail, p, listq); } if ((addr = kva_alloc(bytes)) == 0) goto fail; zkva = addr; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void*)addr); fail: TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } /* * Allocates a number of pages from within an object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; + int req; TAILQ_INIT(&alloctail); keg = zone->uz_keg; + req = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED; + if ((wait & M_WAITOK) != 0) + req |= VM_ALLOC_WAITOK; npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { - p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | - VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | - ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : - VM_ALLOC_NOWAIT)); + p = vm_page_alloc_noobj_domain(domain, req); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Allocate physically contiguous pages. */ static void * contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, vm_size_t size, uint8_t flags) { if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } KASSERT((flags & UMA_SLAB_KERNEL) != 0, ("UMA: page_free used with invalid flags %x", flags)); kmem_free((vm_offset_t)mem, size); } /* * Frees pcpu zone allocations * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) { vm_offset_t sva, curva; vm_paddr_t paddr; vm_page_t m; MPASS(size == (mp_maxid+1)*PAGE_SIZE); if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } sva = (vm_offset_t)mem; for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { paddr = pmap_kextract(curva); m = PHYS_TO_VM_PAGE(paddr); vm_page_unwire_noq(m); vm_page_free(m); } pmap_qremove(sva, size >> PAGE_SHIFT); kva_free(sva, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } #ifdef INVARIANTS static struct noslabbits * slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) { return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); } #endif /* * Actual size of embedded struct slab (!OFFPAGE). */ static size_t slab_sizeof(int nitems) { size_t s; s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; return (roundup(s, UMA_ALIGN_PTR + 1)); } #define UMA_FIXPT_SHIFT 31 #define UMA_FRAC_FIXPT(n, d) \ ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) #define UMA_FIXPT_PCT(f) \ ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) /* * Compute the number of items that will fit in a slab. If hdr is true, the * item count may be limited to provide space in the slab for an inline slab * header. Otherwise, all slab space will be provided for item storage. */ static u_int slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) { u_int ipers; u_int padpi; /* The padding between items is not needed after the last item. */ padpi = rsize - size; if (hdr) { /* * Start with the maximum item count and remove items until * the slab header first alongside the allocatable memory. */ for (ipers = MIN(SLAB_MAX_SETSIZE, (slabsize + padpi - slab_sizeof(1)) / rsize); ipers > 0 && ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; ipers--) continue; } else { ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); } return (ipers); } struct keg_layout_result { u_int format; u_int slabsize; u_int ipers; u_int eff; }; static void keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt, struct keg_layout_result *kl) { u_int total; kl->format = fmt; kl->slabsize = slabsize; /* Handle INTERNAL as inline with an extra page. */ if ((fmt & UMA_ZFLAG_INTERNAL) != 0) { kl->format &= ~UMA_ZFLAG_INTERNAL; kl->slabsize += PAGE_SIZE; } kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize, (fmt & UMA_ZFLAG_OFFPAGE) == 0); /* Account for memory used by an offpage slab header. */ total = kl->slabsize; if ((fmt & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(kl->ipers)->uz_keg->uk_rsize; kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total); } /* * Determine the format of a uma keg. This determines where the slab header * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_layout(uma_keg_t keg) { struct keg_layout_result kl = {}, kl_tmp; u_int fmts[2]; u_int alignsize; u_int nfmt; u_int pages; u_int rsize; u_int slabsize; u_int i, j; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", __func__, keg->uk_name, keg->uk_size, keg->uk_flags, PRINT_UMA_ZFLAGS)); KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 || (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, PRINT_UMA_ZFLAGS)); alignsize = keg->uk_align + 1; #ifdef KASAN /* * ASAN requires that each allocation be aligned to the shadow map * scale factor. */ if (alignsize < KASAN_SHADOW_SCALE) alignsize = KASAN_SHADOW_SCALE; #endif /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); rsize = roundup2(rsize, alignsize); if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { /* * We want one item to start on every align boundary in a page. * To do this we will span pages. We will also extend the item * by the size of align if it is an even multiple of align. * Otherwise, it would fall on the same boundary every time. */ if ((rsize & alignsize) == 0) rsize += alignsize; slabsize = rsize * (PAGE_SIZE / alignsize); slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); slabsize = round_page(slabsize); } else { /* * Start with a slab size of as many pages as it takes to * represent a single item. We will try to fit as many * additional items into the slab as possible. */ slabsize = round_page(keg->uk_size); } /* Build a list of all of the available formats for this keg. */ nfmt = 0; /* Evaluate an inline slab layout. */ if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) fmts[nfmt++] = 0; /* TODO: vm_page-embedded slab. */ /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not want * to do if we're UMA_ZONE_VM, which clearly forbids it. * In those cases, evaluate a pseudo-format called INTERNAL * which has an inline slab header and one extra page to * guarantee that it fits. * * Otherwise, see if using an OFFPAGE slab will improve our * efficiency. */ if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0) fmts[nfmt++] = UMA_ZFLAG_INTERNAL; else fmts[nfmt++] = UMA_ZFLAG_OFFPAGE; /* * Choose a slab size and format which satisfy the minimum efficiency. * Prefer the smallest slab size that meets the constraints. * * Start with a minimum slab size, to accommodate CACHESPREAD. Then, * for small items (up to PAGE_SIZE), the iteration increment is one * page; and for large items, the increment is one item. */ i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize); KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u", keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize, rsize, i)); for ( ; ; i++) { slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) : round_page(rsize * (i - 1) + keg->uk_size); for (j = 0; j < nfmt; j++) { /* Only if we have no viable format yet. */ if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 && kl.ipers > 0) continue; keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp); if (kl_tmp.eff <= kl.eff) continue; kl = kl_tmp; CTR6(KTR_UMA, "keg %s layout: format %#x " "(ipers %u * rsize %u) / slabsize %#x = %u%% eff", keg->uk_name, kl.format, kl.ipers, rsize, kl.slabsize, UMA_FIXPT_PCT(kl.eff)); /* Stop when we reach the minimum efficiency. */ if (kl.eff >= UMA_MIN_EFF) break; } if (kl.eff >= UMA_MIN_EFF || !multipage_slabs || slabsize >= SLAB_MAX_SETSIZE * rsize || (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0) break; } pages = atop(kl.slabsize); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) pages *= mp_maxid + 1; keg->uk_rsize = rsize; keg->uk_ipers = kl.ipers; keg->uk_ppera = pages; keg->uk_flags |= kl.format; /* * How do we find the slab header if it is offpage or if not all item * start addresses are in the same page? We could solve the latter * case with vaddr alignment, but we don't. */ if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 || (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) { if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) keg->uk_flags |= UMA_ZFLAG_HASH; else keg->uk_flags |= UMA_ZFLAG_VTOSLAB; } CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers, pages); KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, keg->uk_ipers, pages)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; int i; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_reserve = 0; keg->uk_flags = arg->flags; /* * We use a global round-robin policy by default. Zones with * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which * case the iterator is never run. */ keg->uk_dr.dr_policy = DOMAINSET_RR(); keg->uk_dr.dr_iter = 0; /* * The primary zone is passed to us at keg-creation time. */ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZFLAG_VTOSLAB; #ifndef SMP keg->uk_flags &= ~UMA_ZONE_PCPU; #endif keg_layout(keg); /* * Use a first-touch NUMA policy for kegs that pmap_extract() will * work on. Use round-robin for everything else. * * Zones may override the default by specifying either. */ #ifdef NUMA if ((keg->uk_flags & (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0) keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) keg->uk_flags |= UMA_ZONE_ROUNDROBIN; #endif /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; else #endif if (booted < BOOT_KVA) keg->uk_allocf = startup_alloc; else if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_freef = uma_small_free; else #endif if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_freef = pcpu_page_free; else keg->uk_freef = page_free; /* * Initialize keg's locks. */ for (i = 0; i < vm_ndomains; i++) KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. See slab_sizeof * definition. */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { size_t shsize; shsize = slab_sizeof(keg->uk_ipers); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, ("zone %s ipers %d rsize %d size %d slab won't fit", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); } if (keg->uk_flags & UMA_ZFLAG_HASH) hash_alloc(&keg->uk_hash, 0); CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } static void zone_kva_available(uma_zone_t zone, void *unused) { uma_keg_t keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return; KEG_GET(zone, keg); if (keg->uk_allocf == startup_alloc) { /* Switch to the real allocator. */ if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; } } static void zone_alloc_counters(uma_zone_t zone, void *unused) { zone->uz_allocs = counter_u64_alloc(M_WAITOK); zone->uz_frees = counter_u64_alloc(M_WAITOK); zone->uz_fails = counter_u64_alloc(M_WAITOK); zone->uz_xdomain = counter_u64_alloc(M_WAITOK); } static void zone_alloc_sysctl(uma_zone_t zone, void *unused) { uma_zone_domain_t zdom; uma_domain_t dom; uma_keg_t keg; struct sysctl_oid *oid, *domainoid; int domains, i, cnt; static const char *nokeg = "cache zone"; char *c; /* * Make a sysctl safe copy of the zone name by removing * any special characters and handling dups by appending * an index. */ if (zone->uz_namecnt != 0) { /* Count the number of decimal digits and '_' separator. */ for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) cnt /= 10; zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, M_UMA, M_WAITOK); sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, zone->uz_namecnt); } else zone->uz_ctlname = strdup(zone->uz_name, M_UMA); for (c = zone->uz_ctlname; *c != '\0'; c++) if (strchr("./\\ -", *c) != NULL) *c = '_'; /* * Basic parameters at the root. */ zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); oid = zone->uz_oid; SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_flags, "A", "Allocator configuration flags"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, "Desired per-cpu cache size"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, "Maximum allowed per-cpu cache size"); /* * keg if present. */ if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) domains = vm_ndomains; else domains = 1; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); keg = zone->uz_keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, keg->uk_name, "Keg name"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, "Real object size with alignment"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, "pages per-slab allocation"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, "items available per-slab"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "align", CTLFLAG_RD, &keg->uk_align, 0, "item alignment mask"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "reserve", CTLFLAG_RD, &keg->uk_reserve, 0, "number of reserved items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, keg, 0, sysctl_handle_uma_slab_efficiency, "I", "Slab utilization (100 - internal fragmentation %)"); domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { dom = &keg->uk_domain[i]; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "pages", CTLFLAG_RD, &dom->ud_pages, 0, "Total pages currently allocated from VM"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_items", CTLFLAG_RD, &dom->ud_free_items, 0, "Items free in the slab layer"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0, "Unused slabs"); } } else SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, nokeg, "Keg name"); /* * Information about zone limits. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_items, "QU", "Current number of allocated items if limit is set"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, "Maximum number of allocated and cached items"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, "Number of threads sleeping at limit"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, "Total zone limit sleeps"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0, "Maximum number of items in each domain's bucket cache"); /* * Per-domain zone information. */ domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { zdom = ZDOM_GET(zone, i); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "nitems", CTLFLAG_RD, &zdom->uzd_nitems, "number of items in this domain"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imax", CTLFLAG_RD, &zdom->uzd_imax, "maximum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imin", CTLFLAG_RD, &zdom->uzd_imin, "minimum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bimin", CTLFLAG_RD, &zdom->uzd_bimin, "Minimum item count in this batch"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wss", CTLFLAG_RD, &zdom->uzd_wss, "Working set size"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "limin", CTLFLAG_RD, &zdom->uzd_limin, "Long time minimum item count"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "timin", CTLFLAG_RD, &zdom->uzd_timin, 0, "Time since zero long time minimum item count"); } /* * General statistics. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, zone, 1, sysctl_handle_uma_zone_cur, "I", "Current number of allocated items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_allocs, "QU", "Total allocation calls"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_frees, "QU", "Total free calls"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "fails", CTLFLAG_RD, &zone->uz_fails, "Number of allocation failures"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "xdomain", CTLFLAG_RD, &zone->uz_xdomain, "Free calls from the wrong domain"); } struct uma_zone_count { const char *name; int count; }; static void zone_count(uma_zone_t zone, void *arg) { struct uma_zone_count *cnt; cnt = arg; /* * Some zones are rapidly created with identical names and * destroyed out of order. This can lead to gaps in the count. * Use one greater than the maximum observed for this name. */ if (strcmp(zone->uz_name, cnt->name) == 0) cnt->count = MAX(cnt->count, zone->uz_namecnt + 1); } static void zone_update_caches(uma_zone_t zone) { int i; for (i = 0; i <= mp_maxid; i++) { cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); } } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zone_count cnt; struct uma_zctor_args *arg = udata; uma_zone_domain_t zdom; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; int i; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_sleeps = 0; zone->uz_bucket_size = 0; zone->uz_bucket_size_min = 0; zone->uz_bucket_size_max = BUCKET_MAX; zone->uz_flags = (arg->flags & UMA_ZONE_SMR); zone->uz_warning = NULL; /* The domain structures follow the cpu structures. */ zone->uz_bucket_max = ULONG_MAX; timevalclear(&zone->uz_ratecheck); /* Count the number of duplicate names. */ cnt.name = arg->name; cnt.count = 0; zone_foreach(zone_count, &cnt); zone->uz_namecnt = cnt.count; ZONE_CROSS_LOCK_INIT(zone); for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(zone, i); ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS)); STAILQ_INIT(&zdom->uzd_buckets); } #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) if (arg->uminit == trash_init && arg->fini == trash_fini) zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; #elif defined(KASAN) if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0) arg->flags |= UMA_ZONE_NOKASAN; #endif /* * This is a pure cache zone, no kegs. */ if (arg->import) { KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, ("zone_ctor: Import specified for non-cache zone.")); zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; #ifdef NUMA /* * Cache zones are round-robin unless a policy is * specified because they may have incompatible * constraints. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) zone->uz_flags |= UMA_ZONE_ROUNDROBIN; #endif rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. */ zone->uz_import = zone_import; zone->uz_release = zone_release; zone->uz_arg = zone; keg = arg->keg; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = (arg->flags & ~UMA_ZONE_SMR); karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* Inherit properties from the keg. */ zone->uz_keg = keg; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); out: if (booted >= BOOT_PCPU) { zone_alloc_counters(zone, NULL); if (booted >= BOOT_RUNNING) zone_alloc_sysctl(zone, NULL); } else { zone->uz_allocs = EARLY_COUNTER; zone->uz_frees = EARLY_COUNTER; zone->uz_fails = EARLY_COUNTER; } /* Caller requests a private SMR context. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) zone->uz_smr = smr_create(zone->uz_name, 0, 0); KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), ("Invalid zone flag combination")); if (arg->flags & UMA_ZFLAG_INTERNAL) zone->uz_bucket_size_max = zone->uz_bucket_size = 0; if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) zone->uz_bucket_size = BUCKET_MAX; else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) zone->uz_bucket_size = 0; else zone->uz_bucket_size = bucket_select(zone->uz_size); zone->uz_bucket_size_min = zone->uz_bucket_size; if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) zone->uz_flags |= UMA_ZFLAG_CTORDTOR; zone_update_caches(zone); return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; uint32_t free, pages; int i; keg = (uma_keg_t)arg; free = pages = 0; for (i = 0; i < vm_ndomains; i++) { free += keg->uk_domain[i].ud_free_items; pages += keg->uk_domain[i].ud_pages; KEG_LOCK_FINI(keg, i); } if (pages != 0) printf("Freed UMA keg (%s) was not empty (%u items). " " Lost %u pages of memory.\n", keg->uk_name ? keg->uk_name : "", pages / keg->uk_ppera * keg->uk_ipers - free, pages); hash_free(&keg->uk_hash); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; int i; zone = (uma_zone_t)arg; sysctl_remove_oid(zone->uz_oid, 1, 1); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; keg->uk_reserve = 0; } zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true); /* * We only destroy kegs from non secondary/non cache zones. */ if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } counter_u64_free(zone->uz_allocs); counter_u64_free(zone->uz_frees); counter_u64_free(zone->uz_fails); counter_u64_free(zone->uz_xdomain); free(zone->uz_ctlname, M_UMA); for (i = 0; i < vm_ndomains; i++) ZDOM_LOCK_FINI(ZDOM_GET(zone, i)); ZONE_CROSS_LOCK_FINI(zone); } static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg) { uma_keg_t keg; uma_zone_t zone; LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone, arg); } LIST_FOREACH(zone, &uma_cachezones, uz_link) zfunc(zone, arg); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) { rw_rlock(&uma_rwlock); zone_foreach_unlocked(zfunc, arg); rw_runlock(&uma_rwlock); } /* * Initialize the kernel memory allocator. This is done after pages can be * allocated but before general KVA is available. */ void uma_startup1(vm_offset_t virtual_avail) { struct uma_zctor_args args; size_t ksize, zsize, size; uma_keg_t primarykeg; uintptr_t m; int domain; uint8_t pflag; bootstart = bootmem = virtual_avail; rw_init(&uma_rwlock, "UMA lock"); sx_init(&uma_reclaim_lock, "umareclaim"); ksize = sizeof(struct uma_keg) + (sizeof(struct uma_domain) * vm_ndomains); ksize = roundup(ksize, UMA_SUPER_ALIGN); zsize = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)) + (sizeof(struct uma_zone_domain) * vm_ndomains); zsize = roundup(zsize, UMA_SUPER_ALIGN); /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */ size = (zsize * 2) + ksize; for (domain = 0; domain < vm_ndomains; domain++) { m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag, M_NOWAIT | M_ZERO); if (m != 0) break; } zones = (uma_zone_t)m; m += zsize; kegs = (uma_zone_t)m; m += zsize; primarykeg = (uma_keg_t)m; /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = primarykeg; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(kegs, zsize, &args, M_WAITOK); args.name = "UMA Zones"; args.size = zsize; args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make zones for slab headers */ slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); smr_init(); } #ifndef UMA_MD_SMALL_ALLOC extern void vm_radix_reserve_kva(void); #endif /* * Advertise the availability of normal kva allocations and switch to * the default back-end allocator. Marks the KVA we consumed on startup * as used in the map. */ void uma_startup2(void) { if (bootstart != bootmem) { vm_map_lock(kernel_map); (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); vm_map_unlock(kernel_map); } #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); #endif booted = BOOT_KVA; zone_foreach_unlocked(zone_kva_available, NULL); bucket_enable(); } /* * Allocate counters as early as possible so that boot-time allocations are * accounted more precisely. */ static void uma_startup_pcpu(void *arg __unused) { zone_foreach_unlocked(zone_alloc_counters, NULL); booted = BOOT_PCPU; } SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL); /* * Finish our initialization steps. */ static void uma_startup3(void *arg __unused) { #ifdef INVARIANTS TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); uma_dbg_cnt = counter_u64_alloc(M_WAITOK); uma_skip_cnt = counter_u64_alloc(M_WAITOK); #endif zone_foreach_unlocked(zone_alloc_sysctl, NULL); callout_init(&uma_callout, 1); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); booted = BOOT_RUNNING; EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); static void uma_shutdown(void) { booted = BOOT_SHUTDOWN; } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* Public functions */ /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", align, name)); /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) /* * Inject procedures which check for memory use after free if we are * allowed to scramble the memory while it is not allocated. This * requires that: UMA is actually able to access the memory, no init * or fini procedures, no dependency on the initial value of the * memory, and no (legitimate) use of the memory after free. Note, * the ctor and dtor do not need to be empty. */ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { args.uminit = trash_init; args.fini = trash_fini; } #endif args.align = align; args.flags = flags; args.keg = NULL; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; keg = primary->uz_keg; memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags | UMA_ZFLAG_CACHE; return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { /* * Large slabs are expensive to reclaim, so don't bother doing * unnecessary work if we're shutting down. */ if (booted == BOOT_SHUTDOWN && zone->uz_fini == NULL && zone->uz_release == zone_release) return; sx_xlock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_xunlock(&uma_reclaim_lock); } void uma_zwait(uma_zone_t zone) { if ((zone->uz_flags & UMA_ZONE_SMR) != 0) uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK)); else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK)); else uma_zfree(zone, uma_zalloc(zone, M_WAITOK)); } void * uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) { void *item, *pcpu_item; #ifdef SMP int i; MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); if (item == NULL) return (NULL); pcpu_item = zpcpu_base_to_offset(item); if (flags & M_ZERO) { #ifdef SMP for (i = 0; i <= mp_maxid; i++) bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); #else bzero(item, zone->uz_size); #endif } return (pcpu_item); } /* * A stub while both regular and pcpu cases are identical. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata) { void *item; #ifdef SMP MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif /* uma_zfree_pcu_*(..., NULL) does nothing, to match free(9). */ if (pcpu_item == NULL) return; item = zpcpu_offset_to_base(pcpu_item); uma_zfree_arg(zone, item, udata); } static inline void * item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags, void *item) { #ifdef INVARIANTS bool skipdbg; #endif kasan_mark_item_valid(zone, item); kmsan_mark_item_uninitialized(zone, item); #ifdef INVARIANTS skipdbg = uma_dbg_zskip(zone, item); if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_ctor != trash_ctor) trash_ctor(item, size, udata, flags); #endif /* Check flags before loading ctor pointer. */ if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) && __predict_false(zone->uz_ctor != NULL) && zone->uz_ctor(item, size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); return (NULL); } #ifdef INVARIANTS if (!skipdbg) uma_dbg_alloc(zone, NULL, item); #endif if (__predict_false(flags & M_ZERO)) return (memset(item, 0, size)); return (item); } static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS bool skipdbg; skipdbg = uma_dbg_zskip(zone, item); if (skip == SKIP_NONE && !skipdbg) { if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (__predict_true(skip < SKIP_DTOR)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, size, udata); #ifdef INVARIANTS if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_dtor != trash_dtor) trash_dtor(item, size, udata); #endif } kasan_mark_item_invalid(zone, item); } #ifdef NUMA static int item_domain(void *item) { int domain; domain = vm_phys_domain(vtophys(item)); KASSERT(domain >= 0 && domain < vm_ndomains, ("%s: unknown domain for item %p", __func__, item)); return (domain); } #endif #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS) #define UMA_ZALLOC_DEBUG static int uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags) { int error; error = 0; #ifdef WITNESS if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_debug: zone \"%s\"", zone->uz_name); } #endif #ifdef INVARIANTS KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_debug: called with M_EXEC")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_debug: called within spinlock or critical section")); KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0, ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO")); #endif #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) { void *item; item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { error = EJUSTRETURN; if (zone->uz_init != NULL && zone->uz_init(item, zone->uz_size, flags) != 0) { *itemp = NULL; return (error); } if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone->uz_fini(item, zone->uz_size); *itemp = NULL; return (error); } *itemp = item; return (error); } /* This is unfortunate but should not be fatal. */ } #endif return (error); } static int uma_zfree_debug(uma_zone_t zone, void *item, void *udata) { KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zfree_debug: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return (EJUSTRETURN); } #endif return (0); } #endif static inline void * cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket, void *udata, int flags) { void *item; int size, uz_flags; item = cache_bucket_pop(cache, bucket); size = cache_uz_size(cache); uz_flags = cache_uz_flags(cache); critical_exit(); return (item_ctor(zone, uz_flags, size, udata, flags, item)); } static __noinline void * cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_cache_bucket_t bucket; int domain; while (cache_alloc(zone, cache, udata, flags)) { cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) continue; return (cache_alloc_item(zone, cache, bucket, udata, flags)); } critical_exit(); /* * We can not get a bucket so try to return a single item. */ if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) domain = PCPU_GET(domain); else domain = UMA_ANYDOMAIN; return (zone_alloc_item(zone, udata, domain, flags)); } /* See uma.h */ void * uma_zalloc_smr(uma_zone_t zone, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zalloc_arg: called with non-SMR zone.")); if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN) return (item); #endif critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, NULL, flags)); return (cache_alloc_item(zone, cache, bucket, NULL, flags)); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_arg: called with SMR zone.")); if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, udata, flags)); return (cache_alloc_item(zone, cache, bucket, udata, flags)); } /* * Replenish an alloc bucket and possibly restore an old one. Called in * a critical section. Returns in a critical section. * * A false return value indicates an allocation failure. * A true return value indicates success and the caller should retry. */ static __noinline bool cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_bucket_t bucket; int curdomain, domain; bool new; CRITICAL_ASSERT(curthread); /* * If we have run out of items in our alloc bucket see * if we can switch with the free bucket. * * SMR Zones can't re-use the free bucket until the sequence has * expired. */ if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 && cache->uc_freebucket.ucb_cnt != 0) { cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); return (true); } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache_bucket_unload_alloc(cache); critical_exit(); if (bucket != NULL) { KASSERT(bucket->ub_cnt == 0, ("cache_alloc: Entered with non-empty alloc bucket.")); bucket_free(zone, bucket, udata); } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zdom lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ domain = PCPU_GET(domain); if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 || VM_DOMAIN_EMPTY(domain)) domain = zone_domain_highest(zone, domain); bucket = cache_fetch_bucket(zone, cache, domain); if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) { bucket = zone_alloc_bucket(zone, udata, domain, flags); new = true; } else { new = false; } CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket == NULL) { critical_enter(); return (false); } /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket.ucb_bucket == NULL && ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 || (curdomain = PCPU_GET(domain)) == domain || VM_DOMAIN_EMPTY(curdomain))) { if (new) atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax, bucket->ub_cnt); cache_bucket_load_alloc(cache, bucket); return (true); } /* * We lost the race, release this bucket and start over. */ critical_exit(); zone_put_bucket(zone, domain, bucket, udata, !new); critical_enter(); return (true); } void * uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { #ifdef NUMA uma_bucket_t bucket; uma_zone_domain_t zdom; void *item; #endif /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", zone->uz_name, zone, domain, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_domain: zone \"%s\"", zone->uz_name); } KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_domain: called with spinlock or critical section held")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_domain: called with SMR zone.")); #ifdef NUMA KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0, ("uma_zalloc_domain: called with non-FIRSTTOUCH zone.")); if (vm_ndomains == 1) return (uma_zalloc_arg(zone, udata, flags)); /* * Try to allocate from the bucket cache before falling back to the keg. * We could try harder and attempt to allocate from per-CPU caches or * the per-domain cross-domain buckets, but the complexity is probably * not worth it. It is more important that frees of previous * cross-domain allocations do not blow up the cache. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt - 1] = NULL; #endif bucket->ub_cnt--; zone_put_bucket(zone, domain, bucket, udata, true); item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item != NULL) { KASSERT(item_domain(item) == domain, ("%s: bucket cache item %p from wrong domain", __func__, item)); counter_u64_add(zone->uz_allocs, 1); } return (item); } ZDOM_UNLOCK(zdom); return (zone_alloc_item(zone, udata, domain, flags)); #else return (uma_zalloc_arg(zone, udata, flags)); #endif } /* * Find a slab with some space. Prefer slabs that are partially used over those * that are totally full. This helps to reduce fragmentation. * * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check * only 'domain'. */ static uma_slab_t keg_first_slab(uma_keg_t keg, int domain, bool rr) { uma_domain_t dom; uma_slab_t slab; int start; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_first_slab: domain %d out of range", domain)); KEG_LOCK_ASSERT(keg, domain); slab = NULL; start = domain; do { dom = &keg->uk_domain[domain]; if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL) return (slab); if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) { LIST_REMOVE(slab, us_link); dom->ud_free_slabs--; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } if (rr) domain = (domain + 1) % vm_ndomains; } while (domain != start); return (NULL); } /* * Fetch an existing slab from a free or partial list. Returns with the * keg domain lock held if a slab was found or unlocked if not. */ static uma_slab_t keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) { uma_slab_t slab; uint32_t reserve; /* HASH has a single free list. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; KEG_LOCK(keg, domain); reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; if (keg->uk_domain[domain].ud_free_items <= reserve || (slab = keg_first_slab(keg, domain, rr)) == NULL) { KEG_UNLOCK(keg, domain); return (NULL); } return (slab); } static uma_slab_t keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) { struct vm_domainset_iter di; uma_slab_t slab; int aflags, domain; bool rr; restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). * * To avoid races we run the iterator with the keg lock held, but that * means that we cannot allow the vm_domainset layer to sleep. Thus, * clear M_WAITOK and handle low memory conditions locally. */ rr = rdomain == UMA_ANYDOMAIN; if (rr) { aflags = (flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); } else { aflags = flags; domain = rdomain; } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); if (slab != NULL) return (slab); /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; slab = keg_alloc_slab(keg, zone, domain, flags, aflags); if (slab != NULL) return (slab); if (!rr && (flags & M_WAITOK) == 0) break; if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { if ((flags & M_WAITOK) != 0) { vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); goto restart; } break; } } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) return (slab); return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { uma_domain_t dom; void *item; int freei; KEG_LOCK_ASSERT(keg, slab->us_domain); dom = &keg->uk_domain[slab->us_domain]; freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; BIT_CLR(keg->uk_ipers, freei, &slab->us_free); item = slab_item(slab, keg, freei); slab->us_freecount--; dom->ud_free_items--; /* * Move this slab to the full list. It must be on the partial list, so * we do not need to update the free slab count. In particular, * keg_fetch_slab() always returns slabs on the partial list. */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int zone_import(void *arg, void **bucket, int max, int domain, int flags) { uma_domain_t dom; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; #ifdef NUMA int stripe; #endif int i; zone = arg; slab = NULL; keg = zone->uz_keg; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) break; #ifdef NUMA stripe = howmany(max, vm_ndomains); #endif dom = &keg->uk_domain[slab->us_domain]; do { bucket[i++] = slab_alloc_item(keg, slab); if (dom->ud_free_items <= keg->uk_reserve) { /* * Avoid depleting the reserve after a * successful item allocation, even if * M_USE_RESERVE is specified. */ KEG_UNLOCK(keg, slab->us_domain); goto out; } #ifdef NUMA /* * If the zone is striped we pick a new slab for every * N allocations. Eliminating this conditional will * instead pick a new domain for each bucket rather * than stripe within each bucket. The current option * produces more fragmentation and requires more cpu * time but yields better distribution. */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && vm_ndomains > 1 && --stripe == 0) break; #endif } while (slab->us_freecount != 0 && i < max); KEG_UNLOCK(keg, slab->us_domain); /* Don't block if we allocated any successfully. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } out: return i; } static int zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) { uint64_t old, new, total, max; /* * The hard case. We're going to sleep because there were existing * sleepers or because we ran out of items. This routine enforces * fairness by keeping fifo order. * * First release our ill gotten gains and make some noise. */ for (;;) { zone_free_limit(zone, count); zone_log_warning(zone); zone_maxaction(zone); if (flags & M_NOWAIT) return (0); /* * We need to allocate an item or set ourself as a sleeper * while the sleepq lock is held to avoid wakeup races. This * is essentially a home rolled semaphore. */ sleepq_lock(&zone->uz_max_items); old = zone->uz_items; do { MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); /* Cache the max since we will evaluate twice. */ max = zone->uz_max_items; if (UZ_ITEMS_SLEEPERS(old) != 0 || UZ_ITEMS_COUNT(old) >= max) new = old + UZ_ITEMS_SLEEPER; else new = old + MIN(count, max - old); } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); /* We may have successfully allocated under the sleepq lock. */ if (UZ_ITEMS_SLEEPERS(new) == 0) { sleepq_release(&zone->uz_max_items); return (new - old); } /* * This is in a different cacheline from uz_items so that we * don't constantly invalidate the fastpath cacheline when we * adjust item counts. This could be limited to toggling on * transitions. */ atomic_add_32(&zone->uz_sleepers, 1); atomic_add_64(&zone->uz_sleeps, 1); /* * We have added ourselves as a sleeper. The sleepq lock * protects us from wakeup races. Sleep now and then retry. */ sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); sleepq_wait(&zone->uz_max_items, PVM); /* * After wakeup, remove ourselves as a sleeper and try * again. We no longer have the sleepq lock for protection. * * Subract ourselves as a sleeper while attempting to add * our count. */ atomic_subtract_32(&zone->uz_sleepers, 1); old = atomic_fetchadd_64(&zone->uz_items, -(UZ_ITEMS_SLEEPER - count)); /* We're no longer a sleeper. */ old -= UZ_ITEMS_SLEEPER; /* * If we're still at the limit, restart. Notably do not * block on other sleepers. Cache the max value to protect * against changes via sysctl. */ total = UZ_ITEMS_COUNT(old); max = zone->uz_max_items; if (total >= max) continue; /* Truncate if necessary, otherwise wake other sleepers. */ if (total + count > max) { zone_free_limit(zone, total + count - max); count = max - total; } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) wakeup_one(&zone->uz_max_items); return (count); } } /* * Allocate 'count' items from our max_items limit. Returns the number * available. If M_NOWAIT is not specified it will sleep until at least * one item can be allocated. */ static int zone_alloc_limit(uma_zone_t zone, int count, int flags) { uint64_t old; uint64_t max; max = zone->uz_max_items; MPASS(max > 0); /* * We expect normal allocations to succeed with a simple * fetchadd. */ old = atomic_fetchadd_64(&zone->uz_items, count); if (__predict_true(old + count <= max)) return (count); /* * If we had some items and no sleepers just return the * truncated value. We have to release the excess space * though because that may wake sleepers who weren't woken * because we were temporarily over the limit. */ if (old < max) { zone_free_limit(zone, (old + count) - max); return (max - old); } return (zone_alloc_limit_hard(zone, count, flags)); } /* * Free a number of items back to the limit. */ static void zone_free_limit(uma_zone_t zone, int count) { uint64_t old; MPASS(count > 0); /* * In the common case we either have no sleepers or * are still over the limit and can just return. */ old = atomic_fetchadd_64(&zone->uz_items, -count); if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) return; /* * Moderate the rate of wakeups. Sleepers will continue * to generate wakeups if necessary. */ wakeup_one(&zone->uz_max_items); } static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int error, maxbucket, cnt; CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name, zone, domain); /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) domain = UMA_ANYDOMAIN; if (zone->uz_max_items > 0) maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size, M_NOWAIT); else maxbucket = zone->uz_bucket_size; if (maxbucket == 0) return (false); /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) { cnt = 0; goto out; } bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, MIN(maxbucket, bucket->ub_entries), domain, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); error = zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); if (error != 0) break; } /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } cnt = bucket->ub_cnt; if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); counter_u64_add(zone->uz_fails, 1); bucket = NULL; } out: if (zone->uz_max_items > 0 && cnt < maxbucket) zone_free_limit(zone, maxbucket - cnt); return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { counter_u64_add(zone->uz_fails, 1); return (NULL); } /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail_cnt; /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { int error; kasan_mark_item_valid(zone, item); error = zone->uz_init(item, zone->uz_size, flags); kasan_mark_item_invalid(zone, item); if (error != 0) { zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); goto fail_cnt; } } item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item == NULL) goto fail; counter_u64_add(zone->uz_allocs, 1); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail_cnt: counter_u64_add(zone->uz_fails, 1); fail: if (zone->uz_max_items > 0) zone_free_limit(zone, 1); CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); return (NULL); } /* See uma.h */ void uma_zfree_smr(uma_zone_t zone, void *item) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zfree_smr: called with non-SMR zone.")); KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer.")); SMR_ASSERT_NOT_ENTERED(zone->uz_smr); if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN) return; #endif cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* SMR Zones must free to the free bucket. */ bucket = &cache->uc_freebucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } #endif if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, NULL, item, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zone_free_item(zone, item, NULL, SKIP_NONE); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zfree_arg: called with SMR zone.")); if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN) return; #endif /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; /* * We are accessing the per-cpu cache without a critical section to * fetch size and flags. This is acceptable, if we are preempted we * will simply read another cpu's line. */ cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); if (UMA_ALWAYS_CTORDTOR || __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0)) item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { if (atomic_load_32(&zone->uz_sleepers) > 0) goto zfree_item; } /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* * Try to free into the allocbucket first to give LIFO * ordering for cache-hot datastructures. Spill over * into the freebucket if necessary. Alloc will swap * them if one runs dry. */ bucket = &cache->uc_allocbucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } else #endif if (bucket->ucb_cnt == bucket->ucb_entries && cache->uc_freebucket.ucb_cnt < cache->uc_freebucket.ucb_entries) cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, udata, item, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); } #ifdef NUMA /* * sort crossdomain free buckets to domain correct buckets and cache * them. */ static void zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucketlist emptybuckets, fullbuckets; uma_zone_domain_t zdom; uma_bucket_t b; smr_seq_t seq; void *item; int domain; CTR3(KTR_UMA, "uma_zfree: zone %s(%p) draining cross bucket %p", zone->uz_name, zone, bucket); /* * It is possible for buckets to arrive here out of order so we fetch * the current smr seq rather than accepting the bucket's. */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); /* * To avoid having ndomain * ndomain buckets for sorting we have a * lock on the current crossfree bucket. A full matrix with * per-domain locking could be used if necessary. */ STAILQ_INIT(&emptybuckets); STAILQ_INIT(&fullbuckets); ZONE_CROSS_LOCK(zone); for (; bucket->ub_cnt > 0; bucket->ub_cnt--) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; domain = item_domain(item); zdom = ZDOM_GET(zone, domain); if (zdom->uzd_cross == NULL) { if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } else { /* * Avoid allocating a bucket with the cross lock * held, since allocation can trigger a * cross-domain free and bucket zones may * allocate from each other. */ ZONE_CROSS_UNLOCK(zone); b = bucket_alloc(zone, udata, M_NOWAIT); if (b == NULL) goto out; ZONE_CROSS_LOCK(zone); if (zdom->uzd_cross != NULL) { STAILQ_INSERT_HEAD(&emptybuckets, b, ub_link); } else { zdom->uzd_cross = b; } } } b = zdom->uzd_cross; b->ub_bucket[b->ub_cnt++] = item; b->ub_seq = seq; if (b->ub_cnt == b->ub_entries) { STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link); if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } } ZONE_CROSS_UNLOCK(zone); out: if (bucket->ub_cnt == 0) bucket->ub_seq = SMR_SEQ_INVALID; bucket_free(zone, bucket, udata); while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); bucket_free(zone, b, udata); } while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) { STAILQ_REMOVE_HEAD(&fullbuckets, ub_link); domain = item_domain(b->ub_bucket[0]); zone_put_bucket(zone, domain, b, udata, true); } } #endif static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws) { #ifdef NUMA /* * Buckets coming from the wrong domain will be entirely for the * only other domain on two domain systems. In this case we can * simply cache them. Otherwise we need to sort them back to * correct domains. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) { zone_free_cross(zone, bucket, udata); return; } #endif /* * Attempt to save the bucket in the zone's domain bucket cache. */ CTR3(KTR_UMA, "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) itemdomain = zone_domain_lowest(zone, itemdomain); zone_put_bucket(zone, itemdomain, bucket, udata, ws); } /* * Populate a free or cross bucket for the current cpu cache. Free any * existing full bucket either to the zone cache or back to the slab layer. * * Enters and returns in a critical section. false return indicates that * we can not satisfy this free in the cache layer. true indicates that * the caller should retry. */ static __noinline bool cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item, int itemdomain) { uma_cache_bucket_t cbucket; uma_bucket_t newbucket, bucket; CRITICAL_ASSERT(curthread); if (zone->uz_bucket_size == 0) return false; cache = &zone->uz_cpu[curcpu]; newbucket = NULL; /* * FIRSTTOUCH domains need to free to the correct zdom. When * enabled this is the zdom of the item. The bucket is the * cross bucket if the current domain and itemdomain do not match. */ cbucket = &cache->uc_freebucket; #ifdef NUMA if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain) { cbucket = &cache->uc_crossbucket; if (cbucket->ucb_cnt != 0) counter_u64_add(zone->uz_xdomain, cbucket->ucb_cnt); } } #endif bucket = cache_bucket_unload(cbucket); KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries, ("cache_free: Entered with non-full free bucket.")); /* We are no longer associated with this CPU. */ critical_exit(); /* * Don't let SMR zones operate without a free bucket. Force * a synchronize and re-use this one. We will only degrade * to a synchronize every bucket_size items rather than every * item if we fail to allocate a bucket. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) { if (bucket != NULL) bucket->ub_seq = smr_advance(zone->uz_smr); newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (newbucket == NULL && bucket != NULL) { bucket_drain(zone, bucket); newbucket = bucket; bucket = NULL; } } else if (!bucketdisable) newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (bucket != NULL) zone_free_bucket(zone, bucket, udata, itemdomain, true); critical_enter(); if ((bucket = newbucket) == NULL) return (false); cache = &zone->uz_cpu[curcpu]; #ifdef NUMA /* * Check to see if we should be populating the cross bucket. If it * is already populated we will fall through and attempt to populate * the free bucket. */ if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain && cache->uc_crossbucket.ucb_bucket == NULL) { cache_bucket_load_cross(cache, bucket); return (true); } } #endif /* * We may have lost the race to fill the bucket or switched CPUs. */ if (cache->uc_freebucket.ucb_bucket != NULL) { critical_exit(); bucket_free(zone, bucket, udata); critical_enter(); } else cache_bucket_load_free(cache, bucket); return (true); } static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; uma_domain_t dom; int freei; keg = zone->uz_keg; KEG_LOCK_ASSERT(keg, slab->us_domain); /* Do we need to remove from any lists? */ dom = &keg->uk_domain[slab->us_domain]; if (slab->us_freecount + 1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ freei = slab_item_index(slab, keg, item); BIT_SET(keg->uk_ipers, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ dom->ud_free_items++; } static void zone_release(void *arg, void **bucket, int cnt) { struct mtx *lock; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; void *item; int i; zone = arg; keg = zone->uz_keg; lock = NULL; if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) lock = KEG_LOCK(keg, 0); for (i = 0; i < cnt; i++) { item = bucket[i]; if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { slab = vtoslab((vm_offset_t)item); } else { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) slab = hash_sfind(&keg->uk_hash, mem); else slab = (uma_slab_t)(mem + keg->uk_pgoff); } if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { if (lock != NULL) mtx_unlock(lock); lock = KEG_LOCK(keg, slab->us_domain); } slab_free_item(zone, slab, item); } if (lock != NULL) mtx_unlock(lock); } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static __noinline void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { /* * If a free is sent directly to an SMR zone we have to * synchronize immediately because the item can instantly * be reallocated. This should only happen in degenerate * cases when no memory is available for per-cpu caches. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE) smr_synchronize(zone->uz_smr); item_dtor(zone, item, zone->uz_size, udata, skip); if (skip < SKIP_FINI && zone->uz_fini) { kasan_mark_item_valid(zone, item); zone->uz_fini(item, zone->uz_size); kasan_mark_item_invalid(zone, item); } zone->uz_release(zone->uz_arg, &item, 1); if (skip & SKIP_CNT) return; counter_u64_add(zone->uz_frees, 1); if (zone->uz_max_items > 0) zone_free_limit(zone, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { /* * If the limit is small, we may need to constrain the maximum per-CPU * cache size, or disable caching entirely. */ uma_zone_set_maxcache(zone, nitems); /* * XXX This can misbehave if the zone has any allocations with * no limit and a limit is imposed. There is currently no * way to clear a limit. */ ZONE_LOCK(zone); zone->uz_max_items = nitems; zone->uz_flags |= UMA_ZFLAG_LIMIT; zone_update_caches(zone); /* We may need to wake waiters. */ wakeup(&zone->uz_max_items); ZONE_UNLOCK(zone); return (nitems); } /* See uma.h */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems) { int bpcpu, bpdom, bsize, nb; ZONE_LOCK(zone); /* * Compute a lower bound on the number of items that may be cached in * the zone. Each CPU gets at least two buckets, and for cross-domain * frees we use an additional bucket per CPU and per domain. Select the * largest bucket size that does not exceed half of the requested limit, * with the left over space given to the full bucket cache. */ bpdom = 0; bpcpu = 2; #ifdef NUMA if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) { bpcpu++; bpdom++; } #endif nb = bpcpu * mp_ncpus + bpdom * vm_ndomains; bsize = nitems / nb / 2; if (bsize > BUCKET_MAX) bsize = BUCKET_MAX; else if (bsize == 0 && nitems / nb > 0) bsize = 1; zone->uz_bucket_size_max = zone->uz_bucket_size = bsize; if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) zone->uz_bucket_size_min = zone->uz_bucket_size_max; zone->uz_bucket_max = nitems - nb * bsize; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; nitems = atomic_load_64(&zone->uz_max_items); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_ASSERT_COLD(zone); zone->uz_warning = warning; } /* See uma.h */ void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) { ZONE_ASSERT_COLD(zone); TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs) - counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems < 0 ? 0 : nitems); } static uint64_t uma_zone_get_allocs(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); return (nitems); } static uint64_t uma_zone_get_frees(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems); } #ifdef INVARIANTS /* Used only for KEG_ASSERT_COLD(). */ static uint64_t uma_keg_get_allocs(uma_keg_t keg) { uma_zone_t z; uint64_t nitems; nitems = 0; LIST_FOREACH(z, &keg->uk_zones, uz_link) nitems += uma_zone_get_allocs(z); return (nitems); } #endif /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_init = uminit; } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_fini = fini; } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_ASSERT_COLD(zone); zone->uz_init = zinit; } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_ASSERT_COLD(zone); zone->uz_fini = zfini; } /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_freef = freef; } /* See uma.h */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_allocf = allocf; } /* See uma.h */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr) { ZONE_ASSERT_COLD(zone); KASSERT(smr != NULL, ("Got NULL smr")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("zone %p (%s) already uses SMR", zone, zone->uz_name)); zone->uz_flags |= UMA_ZONE_SMR; zone->uz_smr = smr; zone_update_caches(zone); } smr_t uma_zone_get_smr(uma_zone_t zone) { return (zone->uz_smr); } /* See uma.h */ void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_reserve = items; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; u_int pages; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); ZONE_ASSERT_COLD(zone); pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); if (kva == 0) return (0); } else kva = 0; MPASS(keg->uk_kva == 0); keg->uk_kva = kva; keg->uk_offset = 0; zone->uz_max_items = pages * keg->uk_ipers; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone_update_caches(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { struct vm_domainset_iter di; uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; int aflags, domain, slabs; KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { aflags = M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); for (;;) { slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, aflags); if (slab != NULL) { dom = &keg->uk_domain[slab->us_domain]; /* * keg_alloc_slab() always returns a slab on the * partial list. */ LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; KEG_UNLOCK(keg, slab->us_domain); break; } if (vm_domainset_iter_policy(&di, &domain) != 0) vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); } } } /* * Returns a snapshot of memory consumption in bytes. */ size_t uma_zone_memory(uma_zone_t zone) { size_t sz; int i; sz = 0; if (zone->uz_flags & UMA_ZFLAG_CACHE) { for (i = 0; i < vm_ndomains; i++) sz += ZDOM_GET(zone, i)->uzd_nitems; return (sz * zone->uz_size); } for (i = 0; i < vm_ndomains; i++) sz += zone->uz_keg->uk_domain[i].ud_pages; return (sz * PAGE_SIZE); } /* See uma.h */ void uma_reclaim(int req) { uma_reclaim_domain(req, UMA_ANYDOMAIN); } void uma_reclaim_domain(int req, int domain) { void *arg; bucket_enable(); arg = (void *)(uintptr_t)domain; sx_slock(&uma_reclaim_lock); switch (req) { case UMA_RECLAIM_TRIM: zone_foreach(zone_trim, arg); break; case UMA_RECLAIM_DRAIN: zone_foreach(zone_drain, arg); break; case UMA_RECLAIM_DRAIN_CPU: zone_foreach(zone_drain, arg); pcpu_cache_drain_safe(NULL); zone_foreach(zone_drain, arg); break; default: panic("unhandled reclamation request %d", req); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzones[0], arg); zone_drain(slabzones[1], arg); bucket_zone_drain(domain); sx_sunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; void uma_reclaim_wakeup(void) { if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) wakeup(uma_reclaim); } void uma_reclaim_worker(void *arg __unused) { for (;;) { sx_xlock(&uma_reclaim_lock); while (atomic_load_int(&uma_reclaim_needed) == 0) sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", hz); sx_xunlock(&uma_reclaim_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); uma_reclaim(UMA_RECLAIM_DRAIN_CPU); atomic_store_int(&uma_reclaim_needed, 0); /* Don't fire more than once per-second. */ pause("umarclslp", hz); } } /* See uma.h */ void uma_zone_reclaim(uma_zone_t zone, int req) { uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN); } void uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain) { void *arg; arg = (void *)(uintptr_t)domain; switch (req) { case UMA_RECLAIM_TRIM: zone_trim(zone, arg); break; case UMA_RECLAIM_DRAIN: zone_drain(zone, arg); break; case UMA_RECLAIM_DRAIN_CPU: pcpu_cache_drain_safe(zone); zone_drain(zone, arg); break; default: panic("unhandled reclamation request %d", req); } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { return (atomic_load_32(&zone->uz_sleepers) > 0); } unsigned long uma_limit(void) { return (uma_kmem_limit); } void uma_set_limit(unsigned long limit) { uma_kmem_limit = limit; } unsigned long uma_size(void) { return (atomic_load_long(&uma_kmem_total)); } long uma_avail(void) { return (uma_kmem_limit - uma_size()); } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * */ static void uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) { uma_cache_t cache; uint64_t allocs, frees, sleeps, xdomain; int cachefree, cpu; allocs = frees = sleeps = xdomain = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; cachefree += cache->uc_allocbucket.ucb_cnt; cachefree += cache->uc_freebucket.ucb_cnt; xdomain += cache->uc_crossbucket.ucb_cnt; cachefree += cache->uc_crossbucket.ucb_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += counter_u64_fetch(z->uz_allocs); frees += counter_u64_fetch(z->uz_frees); xdomain += counter_u64_fetch(z->uz_xdomain); sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; if (xdomainp != NULL) *xdomainp = xdomain; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static void uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, struct uma_percpu_stat *ups, bool internal) { uma_zone_domain_t zdom; uma_cache_t cache; int i; for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(z, i); uth->uth_zone_free += zdom->uzd_nitems; } uth->uth_allocs = counter_u64_fetch(z->uz_allocs); uth->uth_frees = counter_u64_fetch(z->uz_frees); uth->uth_fails = counter_u64_fetch(z->uz_fails); uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain); uth->uth_sleeps = z->uz_sleeps; for (i = 0; i < mp_maxid + 1; i++) { bzero(&ups[i], sizeof(*ups)); if (internal || CPU_ABSENT(i)) continue; cache = &z->uz_cpu[i]; ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; ups[i].ups_allocs = cache->uc_allocs; ups[i].ups_frees = cache->uc_frees; } } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat *ups; struct sbuf sbuf; uma_keg_t kz; uma_zone_t z; uint64_t items; uint32_t kfree, pages; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { kfree = pages = 0; for (i = 0; i < vm_ndomains; i++) { kfree += kz->uk_domain[i].ud_free_items; pages += kz->uk_domain[i].ud_pages; } LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; if (z->uz_max_items > 0) { items = UZ_ITEMS_COUNT(z->uz_items); uth.uth_pages = (items / kz->uk_ipers) * kz->uk_ppera; } else uth.uth_pages = pages; uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * kz->uk_ppera; uth.uth_limit = z->uz_max_items; uth.uth_keg_free = kfree; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; uma_vm_zone_stats(&uth, z, &sbuf, ups, kz->uk_flags & UMA_ZFLAG_INTERNAL); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } LIST_FOREACH(z, &uma_cachezones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_size = z->uz_size; uma_vm_zone_stats(&uth, z, &sbuf, ups, false); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); free(ups, M_TEMP); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max; max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone; int cur; /* * Some callers want to add sysctls for global zones that * may not yet exist so they pass a pointer to a pointer. */ if (arg2 == 0) zone = *(uma_zone_t *)arg1; else zone = arg1; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_allocs(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_frees(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; uma_zone_t zone = arg1; int error; sbuf_new_for_sysctl(&sbuf, NULL, 0, req); if (zone->uz_flags != 0) sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); else sbuf_printf(&sbuf, "0"); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) { uma_keg_t keg = arg1; int avail, effpct, total; total = keg->uk_ppera * PAGE_SIZE; if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; /* * We consider the client's requested size and alignment here, not the * real size determination uk_rsize, because we also adjust the real * size for internal implementation reasons (max bitset size). */ avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) avail *= mp_maxid + 1; effpct = 100 * avail / total; return (sysctl_handle_int(oidp, &effpct, 0, req)); } static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); return (sysctl_handle_64(oidp, &cur, 0, req)); } #ifdef INVARIANTS static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; uma_keg_t keg; uint8_t *mem; /* * It is safe to return the slab here even though the * zone is unlocked because the item's allocation state * essentially holds a reference. */ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (NULL); if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) return (vtoslab((vm_offset_t)mem)); keg = zone->uz_keg; if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) return ((uma_slab_t)(mem + keg->uk_pgoff)); KEG_LOCK(keg, 0); slab = hash_sfind(&keg->uk_hash, mem); KEG_UNLOCK(keg, 0); return (slab); } static bool uma_dbg_zskip(uma_zone_t zone, void *mem) { if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (true); return (uma_dbg_kskip(zone->uz_keg, mem)); } static bool uma_dbg_kskip(uma_keg_t keg, void *mem) { uintptr_t idx; if (dbg_divisor == 0) return (true); if (dbg_divisor == 1) return (false); idx = (uintptr_t)mem >> PAGE_SHIFT; if (keg->uk_ipers > 1) { idx *= keg->uk_ipers; idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; } if ((idx / dbg_divisor) * dbg_divisor != idx) { counter_u64_add(uma_skip_cnt, 1); return (true); } counter_u64_add(uma_dbg_cnt, 1); return (false); } /* * Set up the slab's freei data such that uma_dbg_free can function. * */ static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } /* * Verifies freed addresses. Checks for alignment, valid slab membership * and duplicate frees. * */ static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: Freed item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (freei >= keg->uk_ipers) panic("Invalid free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (slab_item(slab, keg, freei) != item) panic("Unaligned free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } #endif /* INVARIANTS */ #ifdef DDB static int64_t get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, uint64_t *sleeps, long *cachefree, uint64_t *xdomain) { uint64_t frees; int i; if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { *allocs = counter_u64_fetch(z->uz_allocs); frees = counter_u64_fetch(z->uz_frees); *sleeps = z->uz_sleeps; *cachefree = 0; *xdomain = 0; } else uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, xdomain); for (i = 0; i < vm_ndomains; i++) { *cachefree += ZDOM_GET(z, i)->uzd_nitems; if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) *cachefree += kz->uk_domain[i].ud_free_items; } *used = *allocs - frees; return (((int64_t)*used + *cachefree) * kz->uk_size); } DB_SHOW_COMMAND(uma, db_show_uma) { const char *fmt_hdr, *fmt_entry; uma_keg_t kz; uma_zone_t z; uint64_t allocs, used, sleeps, xdomain; long cachefree; /* variables for sorting */ uma_keg_t cur_keg; uma_zone_t cur_zone, last_zone; int64_t cur_size, last_size, size; int ties; /* /i option produces machine-parseable CSV output */ if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; } else { fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; } db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket", "Total Mem", "XFree"); /* Sort the zones with largest size first. */ last_zone = NULL; last_size = INT64_MAX; for (;;) { cur_zone = NULL; cur_size = -1; ties = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { /* * In the case of size ties, print out zones * in the order they are encountered. That is, * when we encounter the most recently output * zone, we have already printed all preceding * ties, and we must print all following ties. */ if (z == last_zone) { ties = 1; continue; } size = get_uma_stats(kz, z, &allocs, &used, &sleeps, &cachefree, &xdomain); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_zone = z; cur_keg = kz; } } } if (cur_zone == NULL) break; size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, &sleeps, &cachefree, &xdomain); db_printf(fmt_entry, cur_zone->uz_name, (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, (uintmax_t)allocs, (uintmax_t)sleeps, (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, xdomain); if (db_pager_quit) return; last_zone = cur_zone; last_size = cur_size; } } DB_SHOW_COMMAND(umacache, db_show_umacache) { uma_zone_t z; uint64_t allocs, frees; long cachefree; int i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); for (i = 0; i < vm_ndomains; i++) cachefree += ZDOM_GET(z, i)->uzd_nitems; db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_bucket_size); if (db_pager_quit) return; } } #endif /* DDB */ diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 7ab1fdb8950e..4ddeb3208260 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -1,956 +1,953 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Kernel memory management. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vm_map kernel_map_store; struct vm_map exec_map_store; struct vm_map pipe_map_store; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); /* NB: Used by kernel debuggers. */ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; u_int exec_map_entry_size; u_int exec_map_entries; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) &vm_max_kernel_address, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif "Max kernel address"); #if VM_NRESERVLEVEL > 0 #define KVA_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) #else /* On non-superpage architectures we want large import sizes. */ #define KVA_QUANTUM_SHIFT (8 + PAGE_SHIFT) #endif #define KVA_QUANTUM (1ul << KVA_QUANTUM_SHIFT) #define KVA_NUMA_IMPORT_QUANTUM (KVA_QUANTUM * 128) extern void uma_startup2(void); /* * kva_alloc: * * Allocate a virtual address range with no underlying object and * no initial mapping to physical memory. Any mapping from this * range to physical memory must be explicitly created prior to * its use, typically with pmap_qenter(). Any attempt to create * a mapping on demand through vm_fault() will result in a panic. */ vm_offset_t kva_alloc(vm_size_t size) { vm_offset_t addr; size = round_page(size); if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr)) return (0); return (addr); } /* * kva_free: * * Release a region of kernel virtual memory allocated * with kva_alloc, and return the physical pages * associated with that region. * * This routine may not block on kernel maps. */ void kva_free(vm_offset_t addr, vm_size_t size) { size = round_page(size); vmem_free(kernel_arena, addr, size); } static vm_page_t kmem_alloc_contig_pages(vm_object_t object, vm_pindex_t pindex, int domain, int pflags, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m; int tries; bool wait, reclaim; VM_OBJECT_ASSERT_WLOCKED(object); /* Disallow an invalid combination of flags. */ MPASS((pflags & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)); wait = (pflags & VM_ALLOC_WAITOK) != 0; reclaim = (pflags & VM_ALLOC_NORECLAIM) == 0; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; for (tries = wait ? 3 : 1;; tries--) { m = vm_page_alloc_contig_domain(object, pindex, domain, pflags, npages, low, high, alignment, boundary, memattr); if (m != NULL || tries == 0 || !reclaim) break; VM_OBJECT_WUNLOCK(object); if (!vm_page_reclaim_contig_domain(domain, pflags, npages, low, high, alignment, boundary) && wait) vm_wait_domain(domain); VM_OBJECT_WLOCK(object); } return (m); } /* * Allocates a region from the kernel address map and physical pages * within the specified address range to the kernel object. Creates a * wired mapping from this region to these pages, and returns the * region's starting virtual address. The allocated pages are not * necessarily physically contiguous. If M_ZERO is specified through the * given flags, then the pages are zeroed before they are mapped. */ static vm_offset_t kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object; vm_offset_t addr, i, offset; vm_page_t m; vm_size_t asize; int pflags; vm_prot_t prot; object = kernel_object; asize = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, asize, M_BESTFIT | flags, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED; prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW; VM_OBJECT_WLOCK(object); for (i = 0; i < asize; i += PAGE_SIZE) { m = kmem_alloc_contig_pages(object, atop(offset + i), domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); kmem_unback(object, addr, i); vmem_free(vmem, addr, asize); return (0); } KASSERT(vm_page_domain(m) == domain, ("kmem_alloc_attr_domain: Domain mismatch %d != %d", vm_page_domain(m), domain)); if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); pmap_enter(kernel_pmap, addr + i, m, prot, prot | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); kasan_mark((void *)addr, size, asize, KASAN_KMEM_REDZONE); return (addr); } vm_offset_t kmem_alloc_attr(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { return (kmem_alloc_attr_domainset(DOMAINSET_RR(), size, flags, low, high, memattr)); } vm_offset_t kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_alloc_attr_domain(domain, size, flags, low, high, memattr); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* * Allocates a region from the kernel address map and physically * contiguous pages within the specified address range to the kernel * object. Creates a wired mapping from this region to these pages, and * returns the region's starting virtual address. If M_ZERO is specified * through the given flags, then the pages are zeroed before they are * mapped. */ static vm_offset_t kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object; vm_offset_t addr, offset, tmp; vm_page_t end_m, m; vm_size_t asize; u_long npages; int pflags; object = kernel_object; asize = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, asize, flags | M_BESTFIT, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED; npages = atop(asize); VM_OBJECT_WLOCK(object); m = kmem_alloc_contig_pages(object, atop(offset), domain, pflags, npages, low, high, alignment, boundary, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); vmem_free(vmem, addr, asize); return (0); } KASSERT(vm_page_domain(m) == domain, ("kmem_alloc_contig_domain: Domain mismatch %d != %d", vm_page_domain(m), domain)); end_m = m + npages; tmp = addr; for (; m < end_m; m++) { if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, VM_PROT_RW | PMAP_ENTER_WIRED, 0); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); kasan_mark((void *)addr, size, asize, KASAN_KMEM_REDZONE); return (addr); } vm_offset_t kmem_alloc_contig(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { return (kmem_alloc_contig_domainset(DOMAINSET_RR(), size, flags, low, high, alignment, boundary, memattr)); } vm_offset_t kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_alloc_contig_domain(domain, size, flags, low, high, alignment, boundary, memattr); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* * kmem_subinit: * * Initializes a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find * superpage_align Request that min is superpage aligned */ void kmem_subinit(vm_map_t map, vm_map_t parent, vm_offset_t *min, vm_offset_t *max, vm_size_t size, bool superpage_align) { int ret; size = round_page(size); *min = vm_map_min(parent); ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ? VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); if (ret != KERN_SUCCESS) panic("kmem_subinit: bad status return of %d", ret); *max = *min + size; vm_map_init(map, vm_map_pmap(parent), *min, *max); if (vm_map_submap(parent, *min, *max, map) != KERN_SUCCESS) panic("kmem_subinit: unable to change range to submap"); } /* * kmem_malloc_domain: * * Allocate wired-down pages in the kernel's address space. */ static vm_offset_t kmem_malloc_domain(int domain, vm_size_t size, int flags) { vmem_t *arena; vm_offset_t addr; vm_size_t asize; int rv; if (__predict_true((flags & M_EXEC) == 0)) arena = vm_dom[domain].vmd_kernel_arena; else arena = vm_dom[domain].vmd_kernel_rwx_arena; asize = round_page(size); if (vmem_alloc(arena, asize, flags | M_BESTFIT, &addr)) return (0); rv = kmem_back_domain(domain, kernel_object, addr, asize, flags); if (rv != KERN_SUCCESS) { vmem_free(arena, addr, asize); return (0); } kasan_mark((void *)addr, size, asize, KASAN_KMEM_REDZONE); return (addr); } vm_offset_t kmem_malloc(vm_size_t size, int flags) { return (kmem_malloc_domainset(DOMAINSET_RR(), size, flags)); } vm_offset_t kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_malloc_domain(domain, size, flags); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* * kmem_back_domain: * * Allocate physical pages from the specified domain for the specified * virtual address range. */ int kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t offset, i; vm_page_t m, mpred; vm_prot_t prot; int pflags; KASSERT(object == kernel_object, ("kmem_back_domain: only supports kernel object.")); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if (flags & M_WAITOK) pflags |= VM_ALLOC_WAITFAIL; prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW; i = 0; VM_OBJECT_WLOCK(object); retry: mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i)); for (; i < size; i += PAGE_SIZE, mpred = m) { m = vm_page_alloc_domain_after(object, atop(offset + i), domain, pflags, mpred); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. */ if (m == NULL) { if ((flags & M_NOWAIT) == 0) goto retry; VM_OBJECT_WUNLOCK(object); kmem_unback(object, addr, i); return (KERN_NO_SPACE); } KASSERT(vm_page_domain(m) == domain, ("kmem_back_domain: Domain mismatch %d != %d", vm_page_domain(m), domain)); if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); vm_page_valid(m); pmap_enter(kernel_pmap, addr + i, m, prot, prot | PMAP_ENTER_WIRED, 0); if (__predict_false((prot & VM_PROT_EXECUTE) != 0)) m->oflags |= VPO_KMEM_EXEC; } VM_OBJECT_WUNLOCK(object); return (KERN_SUCCESS); } /* * kmem_back: * * Allocate physical pages for the specified virtual address range. */ int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t end, next, start; int domain, rv; KASSERT(object == kernel_object, ("kmem_back: only supports kernel object.")); for (start = addr, end = addr + size; addr < end; addr = next) { /* * We must ensure that pages backing a given large virtual page * all come from the same physical domain. */ if (vm_ndomains > 1) { domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; while (VM_DOMAIN_EMPTY(domain)) domain++; next = roundup2(addr + 1, KVA_QUANTUM); if (next > end || next < start) next = end; } else { domain = 0; next = end; } rv = kmem_back_domain(domain, object, addr, next - addr, flags); if (rv != KERN_SUCCESS) { kmem_unback(object, start, addr - start); break; } } return (rv); } /* * kmem_unback: * * Unmap and free the physical pages underlying the specified virtual * address range. * * A physical page must exist within the specified object at each index * that is being unmapped. */ static struct vmem * _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { struct vmem *arena; vm_page_t m, next; vm_offset_t end, offset; int domain; KASSERT(object == kernel_object, ("kmem_unback: only supports kernel object.")); if (size == 0) return (NULL); pmap_remove(kernel_pmap, addr, addr + size); offset = addr - VM_MIN_KERNEL_ADDRESS; end = offset + size; VM_OBJECT_WLOCK(object); m = vm_page_lookup(object, atop(offset)); domain = vm_page_domain(m); if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0)) arena = vm_dom[domain].vmd_kernel_arena; else arena = vm_dom[domain].vmd_kernel_rwx_arena; for (; offset < end; offset += PAGE_SIZE, m = next) { next = vm_page_next(m); vm_page_xbusy_claim(m); vm_page_unwire_noq(m); vm_page_free(m); } VM_OBJECT_WUNLOCK(object); return (arena); } void kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { (void)_kmem_unback(object, addr, size); } /* * kmem_free: * * Free memory allocated with kmem_malloc. The size must match the * original allocation. */ void kmem_free(vm_offset_t addr, vm_size_t size) { struct vmem *arena; size = round_page(size); kasan_mark((void *)addr, size, size, 0); arena = _kmem_unback(kernel_object, addr, size); if (arena != NULL) vmem_free(arena, addr, size); } /* * kmap_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmap_alloc_wait(vm_map_t map, vm_size_t size) { vm_offset_t addr; size = round_page(size); if (!swap_reserve(size)) return (0); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); addr = vm_map_findspace(map, vm_map_min(map), size); if (addr + size <= vm_map_max(map)) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } /* * kmap_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size) { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); } void kmem_init_zero_region(void) { vm_offset_t addr, i; vm_page_t m; /* * Map a single physical page of zeros to a larger virtual range. * This requires less looping in places that want large amounts of * zeros, while not using much more physical resources. */ addr = kva_alloc(ZERO_REGION_SIZE); - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE) pmap_qenter(addr + i, &m, 1); pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ); zero_region = (const void *)addr; } /* * Import KVA from the kernel map into the kernel arena. */ static int kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) { vm_offset_t addr; int result; KASSERT((size % KVA_QUANTUM) == 0, ("kva_import: Size %jd is not a multiple of %d", (intmax_t)size, (int)KVA_QUANTUM)); addr = vm_map_min(kernel_map); result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS) return (ENOMEM); *addrp = addr; return (0); } /* * Import KVA from a parent arena into a per-domain arena. Imports must be * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size. */ static int kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) { KASSERT((size % KVA_QUANTUM) == 0, ("kva_import_domain: Size %jd is not a multiple of %d", (intmax_t)size, (int)KVA_QUANTUM)); return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp)); } /* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. * Create the kernel vmem arena and its per-domain children. */ void kmem_init(vm_offset_t start, vm_offset_t end) { vm_size_t quantum; int domain; vm_map_init(kernel_map, kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); kernel_map->system_map = 1; vm_map_lock(kernel_map); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ (void)vm_map_insert(kernel_map, NULL, 0, #ifdef __amd64__ KERNBASE, #else VM_MIN_KERNEL_ADDRESS, #endif start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ #ifdef __amd64__ /* * Mark KVA used for the page array as allocated. Other platforms * that handle vm_page_array allocation can simply adjust virtual_avail * instead. */ (void)vm_map_insert(kernel_map, NULL, 0, (vm_offset_t)vm_page_array, (vm_offset_t)vm_page_array + round_2mpage(vm_page_array_size * sizeof(struct vm_page)), VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); #endif vm_map_unlock(kernel_map); /* * Use a large import quantum on NUMA systems. This helps minimize * interleaving of superpages, reducing internal fragmentation within * the per-domain arenas. */ if (vm_ndomains > 1 && PMAP_HAS_DMAP) quantum = KVA_NUMA_IMPORT_QUANTUM; else quantum = KVA_QUANTUM; /* * Initialize the kernel_arena. This can grow on demand. */ vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); vmem_set_import(kernel_arena, kva_import, NULL, NULL, quantum); for (domain = 0; domain < vm_ndomains; domain++) { /* * Initialize the per-domain arenas. These are used to color * the KVA space in a way that ensures that virtual large pages * are backed by memory from the same physical domain, * maximizing the potential for superpage promotion. */ vm_dom[domain].vmd_kernel_arena = vmem_create( "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_arena, kva_import_domain, NULL, kernel_arena, quantum); /* * In architectures with superpages, maintain separate arenas * for allocations with permissions that differ from the * "standard" read/write permissions used for kernel memory, * so as not to inhibit superpage promotion. * * Use the base import quantum since this arena is rarely used. */ #if VM_NRESERVLEVEL > 0 vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, kva_import_domain, (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); #else vm_dom[domain].vmd_kernel_rwx_arena = vm_dom[domain].vmd_kernel_arena; #endif } /* * This must be the very first call so that the virtual address * space used for early allocations is properly marked used in * the map. */ uma_startup2(); } /* * kmem_bootstrap_free: * * Free pages backing preloaded data (e.g., kernel modules) to the * system. Currently only supported on platforms that create a * vm_phys segment for preloaded data. */ void kmem_bootstrap_free(vm_offset_t start, vm_size_t size) { #if defined(__i386__) || defined(__amd64__) struct vm_domain *vmd; vm_offset_t end, va; vm_paddr_t pa; vm_page_t m; end = trunc_page(start + size); start = round_page(start); #ifdef __amd64__ /* * Preloaded files do not have execute permissions by default on amd64. * Restore the default permissions to ensure that the direct map alias * is updated. */ pmap_change_prot(start, end - start, VM_PROT_RW); #endif for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_kextract(va); m = PHYS_TO_VM_PAGE(pa); vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); vm_cnt.v_page_count++; } pmap_remove(kernel_pmap, start, end); (void)vmem_add(kernel_arena, start, end - start, M_WAITOK); #endif } /* * Allow userspace to directly trigger the VM drain routine for testing * purposes. */ static int debug_vm_lowmem(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error != 0) return (error); if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0) return (EINVAL); if (i != 0) EVENTHANDLER_INVOKE(vm_lowmem, i); return (0); } SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags"); static int debug_uma_reclaim(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (i != UMA_RECLAIM_TRIM && i != UMA_RECLAIM_DRAIN && i != UMA_RECLAIM_DRAIN_CPU) return (EINVAL); uma_reclaim(i); return (0); } SYSCTL_PROC(_debug, OID_AUTO, uma_reclaim, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, debug_uma_reclaim, "I", "set to generate request to reclaim uma caches"); static int debug_uma_reclaim_domain(SYSCTL_HANDLER_ARGS) { int domain, error, request; request = 0; error = sysctl_handle_int(oidp, &request, 0, req); if (error != 0 || req->newptr == NULL) return (error); domain = request >> 4; request &= 0xf; if (request != UMA_RECLAIM_TRIM && request != UMA_RECLAIM_DRAIN && request != UMA_RECLAIM_DRAIN_CPU) return (EINVAL); if (domain < 0 || domain >= vm_ndomains) return (EINVAL); uma_reclaim_domain(request, domain); return (0); } SYSCTL_PROC(_debug, OID_AUTO, uma_reclaim_domain, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, debug_uma_reclaim_domain, "I", ""); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 74b09fc27be5..0190f67dd7da 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1,5576 +1,5575 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. */ domainset_t __exclusive_cache_line vm_min_domains; domainset_t __exclusive_cache_line vm_severe_domains; static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM page statistics"); static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries, CTLFLAG_RD, &pqstate_commit_retries, "Number of failed per-page atomic queue state updates"); static COUNTER_U64_DEFINE_EARLY(queue_ops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, &queue_ops, "Number of batched queue operations"); static COUNTER_U64_DEFINE_EARLY(queue_nops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops, CTLFLAG_RD, &queue_nops, "Number of batched queue operations with no effects"); /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; struct bitset *vm_page_dump; long vm_page_dump_pages; static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static bool vm_page_free_prep(vm_page_t m); static void vm_page_free_toq(vm_page_t m); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, const uint16_t nflag); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags); static void vm_page_zone_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | - VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED); } /* * The cache page zone is initialized later since we need to be able to allocate * pages before UMA is fully initialized. */ static void vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int cache, domain, maxcache, pool; maxcache = 0; TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &maxcache); maxcache *= mp_ncpus; for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (pool = 0; pool < VM_NFREEPOOL; pool++) { pgcache = &vmd->vmd_pgcache[pool]; pgcache->domain = domain; pgcache->pool = pool; pgcache->zone = uma_zcache_create("vm pgcache", PAGE_SIZE, NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_VM); /* * Limit each pool's zone to 0.1% of the pages in the * domain. */ cache = maxcache != 0 ? maxcache : vmd->vmd_page_count / 1000; uma_zone_set_maxcache(pgcache->zone, cache); } } } SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose) { struct vm_domain *vmd; vm_page_t m; int ret; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) return (true); /* page does not exist, no failure */ vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); vm_domain_free_unlock(vmd); if (ret != 0) { vm_domain_freecnt_inc(vmd, -1); TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (verbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } return (ret); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; char *next; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; vm_page_blacklist_add(pa, bootverbose); } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Initialize a dummy page for use in scans of the specified paging queue. * In principle, this function only needs to set the flag PG_MARKER. * Nonetheless, it write busies the page as a safety precaution. */ void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->a.flags = aflags; marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE; marker->a.queue = queue; } static void vm_page_domain_init(int domain) { struct vm_domain *vmd; struct vm_pagequeue *pq; int i; vmd = VM_DOMAIN(domain); bzero(vmd, sizeof(*vmd)); *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); pq->pq_pdpages = 0; vm_page_init_marker(&vmd->vmd_markers[i], i, 0); } mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); /* * inacthead is used to provide FIFO ordering for LRU-bypassing * insertions. */ vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The clock pages are used to implement active queue scanning without * requeues. Scans start at clock[0], which is advanced after the scan * ends. When the two clock hands meet, they are reset and scanning * resumes from the head of the queue. */ vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[1], plinks.q); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_FREED; m->flags = m->a.flags = 0; m->phys_addr = pa; m->a.queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } #ifndef PMAP_HAS_PAGE_ARRAY static vm_paddr_t vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range) { vm_paddr_t new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ *vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = page_range; return (new_end); } #endif /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; struct vm_domain *vmd; vm_page_t m; char *list, *listend; vm_paddr_t end, high_avail, low_avail, new_end, size; vm_paddr_t page_range __unused; vm_paddr_t last_pa, pa, startp, endp; u_long pagecount; #if MINIDUMP_PAGE_TRACKING u_long vm_page_dump_size; #endif int biggestone, i, segind; #ifdef WITNESS vm_offset_t mapped; int witness_size; #endif #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif vaddr = round_page(vaddr); vm_phys_early_startup(); biggestone = vm_phys_avail_largest(); end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); new_end = end; #ifdef WITNESS witness_size = round_page(witness_startup_count()); new_end -= witness_size; mapped = pmap_map(&vaddr, new_end, new_end + witness_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, witness_size); witness_startup((void *)mapped); #endif #if MINIDUMP_PAGE_TRACKING /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; vm_page_dump_pages = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) { vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) - dump_avail[i] / PAGE_SIZE; if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; } vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages)); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include the UMA bootstrap pages, witness pages and vm_page_dump * in a crash dump. When pmap_map() uses the direct map, they are * not automatically included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef PMAP_HAS_PAGE_ARRAY pmap_page_array_startup(size / PAGE_SIZE); biggestone = vm_phys_avail_largest(); end = new_end = phys_avail[biggestone + 1]; #else #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; new_end = vm_page_array_alloc(&vaddr, end, page_range); #endif #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ new_end = vm_reserv_startup(&vaddr, new_end); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) if (vm_phys_avail_size(i) != 0) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment's pages that are covered by one of * phys_avail's ranges to the free lists. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (seg->end <= phys_avail[i] || seg->start >= phys_avail[i + 1]) continue; startp = MAX(seg->start, phys_avail[i]); endp = MIN(seg->end, phys_avail[i + 1]); pagecount = (u_long)atop(endp - startp); if (pagecount == 0) continue; m = seg->first_page + atop(startp - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << segind; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_trybusy * * Helper routine for grab functions to trylock busy. * * Returns true on success and false on failure. */ static bool vm_page_trybusy(vm_page_t m, int allocflags) { if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0) return (vm_page_trysbusy(m)); else return (vm_page_tryxbusy(m)); } /* * vm_page_tryacquire * * Helper routine for grab functions to trylock busy and wire. * * Returns true on success and false on failure. */ static inline bool vm_page_tryacquire(vm_page_t m, int allocflags) { bool locked; locked = vm_page_trybusy(m, allocflags); if (locked && (allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); return (locked); } /* * vm_page_busy_acquire: * * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop * and drop the object lock if necessary. */ bool vm_page_busy_acquire(vm_page_t m, int allocflags) { vm_object_t obj; bool locked; /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = atomic_load_ptr(&m->object); for (;;) { if (vm_page_tryacquire(m, allocflags)) return (true); if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); if (obj != NULL) locked = VM_OBJECT_WOWNED(obj); else locked = false; MPASS(locked || vm_page_wired(m)); if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags, locked) && locked) VM_OBJECT_WLOCK(obj); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); KASSERT(m->object == obj || m->object == NULL, ("vm_page_busy_acquire: page %p does not belong to %p", m, obj)); } } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); x = vm_page_busy_fetch(m); for (;;) { if (atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_SHARERS_WORD(1))) break; } if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * * vm_page_busy_tryupgrade: * * Attempt to upgrade a single shared busy into an exclusive busy. */ int vm_page_busy_tryupgrade(vm_page_t m) { u_int ce, x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); ce = VPB_CURTHREAD_EXCLUSIVE; for (;;) { if (VPB_SHARERS(x) > 1) return (0); KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_busy_tryupgrade: invalid lock state")); if (!atomic_fcmpset_acq_int(&m->busy_lock, &x, ce | (x & VPB_BIT_WAITERS))) continue; return (1); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); for (;;) { KASSERT(x != VPB_FREED, ("vm_page_sunbusy: Unlocking freed page.")); if (VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) == 0) break; wakeup(m); break; } } /* * vm_page_busy_sleep: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If nonshared is true, sleep only if the page is xbusy. * * The object lock must be held on entry and will be released on exit. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { vm_object_t obj; obj = m->object; VM_OBJECT_ASSERT_LOCKED(obj); vm_page_lock_assert(m, MA_NOTOWNED); if (!_vm_page_busy_sleep(obj, m, m->pindex, wmesg, nonshared ? VM_ALLOC_SBUSY : 0 , true)) VM_OBJECT_DROP(obj); } /* * vm_page_busy_sleep_unlocked: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If nonshared is true, sleep only if the page is xbusy. * * The object lock must not be held on entry. The operation will * return if the page changes identity. */ void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, bool nonshared) { VM_OBJECT_ASSERT_UNLOCKED(obj); vm_page_lock_assert(m, MA_NOTOWNED); _vm_page_busy_sleep(obj, m, pindex, wmesg, nonshared ? VM_ALLOC_SBUSY : 0, false); } /* * _vm_page_busy_sleep: * * Internal busy sleep function. Verifies the page identity and * lockstate against parameters. Returns true if it sleeps and * false otherwise. * * If locked is true the lock will be dropped for any true returns * and held for any false returns. */ static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { bool xsleep; u_int x; /* * If the object is busy we must wait for that to drain to zero * before trying the page again. */ if (obj != NULL && vm_object_busied(obj)) { if (locked) VM_OBJECT_DROP(obj); vm_object_busy_wait(obj, wmesg); return (true); } if (!vm_page_busied(m)) return (false); xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0; sleepq_lock(m); x = vm_page_busy_fetch(m); do { /* * If the page changes objects or becomes unlocked we can * simply return. */ if (x == VPB_UNBUSIED || (xsleep && (x & VPB_BIT_SHARED) != 0) || m->object != obj || m->pindex != pindex) { sleepq_release(m); return (false); } if ((x & VPB_BIT_WAITERS) != 0) break; } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS)); if (locked) VM_OBJECT_DROP(obj); DROP_GIANT(); sleepq_add(m, NULL, wmesg, 0, 0); sleepq_wait(m, PVM); PICKUP_GIANT(); return (true); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { vm_object_t obj; u_int x; obj = m->object; x = vm_page_busy_fetch(m); for (;;) { if ((x & VPB_BIT_SHARED) == 0) return (0); /* * Reduce the window for transient busies that will trigger * false negatives in vm_page_ps_test(). */ if (obj != NULL && vm_object_busied(obj)) return (0); if (atomic_fcmpset_acq_int(&m->busy_lock, &x, x + VPB_ONE_SHARER)) break; } /* Refetch the object now that we're guaranteed that it is stable. */ obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_sunbusy(m); return (0); } return (1); } /* * vm_page_tryxbusy: * * Try to exclusive busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_tryxbusy(vm_page_t m) { vm_object_t obj; if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, VPB_CURTHREAD_EXCLUSIVE) == 0) return (0); obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_xunbusy(m); return (0); } return (1); } static void vm_page_xunbusy_hard_tail(vm_page_t m) { atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* Wake the waiter. */ wakeup(m); } /* * vm_page_xunbusy_hard: * * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_xunbusy_hard_tail(m); } void vm_page_xunbusy_hard_unchecked(vm_page_t m) { vm_page_assert_xbusied_unchecked(m); vm_page_xunbusy_hard_tail(m); } static void vm_page_busy_free(vm_page_t m) { u_int x; atomic_thread_fence_rel(); x = atomic_swap_int(&m->busy_lock, VPB_FREED); if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { for (; count != 0; count--) { vm_page_unwire(*ma, PQ_ACTIVE); ma++; } } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->a.queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; /* Fictitious pages are unevictable. */ m->ref_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); vm_page_assert_xbusied(m); vm_page_busy_free(m); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_xunbusy_unchecked(m); } /* * Destroy the identity of an invalid page and free it if possible. * This is intended to be used when reading a page from backing store fails. */ void vm_page_free_invalid(vm_page_t m) { KASSERT(vm_page_none_valid(m), ("page %p is valid", m)); KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); KASSERT(m->object != NULL, ("page %p has no object", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * We may be attempting to free the page as part of the handling for an * I/O error, in which case the page was xbusied by a different thread. */ vm_page_xbusy_claim(m); /* * If someone has wired this page while the object lock * was not held, then the thread that unwires is responsible * for freeing the page. Otherwise just free the page now. * The wire count of this unmapped page cannot change while * we have the page xbusy and the page's object wlocked. */ if (vm_page_remove(m)) vm_page_free(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the object lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *wmesg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; if (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, 0, true)) { VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_sleep_if_xbusy: * * Sleep and release the object lock if the page is xbusied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_xbusy(vm_page_t m, const char *wmesg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; if (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, VM_ALLOC_SBUSY, true)) { VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page. */ m->object = object; m->pindex = pindex; m->ref_count |= VPRC_OBJREF; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; m->ref_count &= ~VPRC_OBJREF; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("vm_page_insert_radixdone: page %p is missing object ref", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_radixdone: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_radixdone: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's generation count. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * Do the work to remove a page from its object. The caller is responsible for * updating the page's fields to reflect this removal. */ static void vm_page_object_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; vm_page_assert_xbusied(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); m->object = NULL; mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Returns true if the object's reference * was the last reference to the page, and false otherwise. * * The object must be locked and the page must be exclusively busied. * The exclusive busy will be released on return. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ bool vm_page_remove(vm_page_t m) { bool dropped; dropped = vm_page_remove_xbusy(m); vm_page_xunbusy(m); return (dropped); } /* * vm_page_remove_xbusy * * Removes the page but leaves the xbusy held. Returns true if this * removed the final ref and false otherwise. */ bool vm_page_remove_xbusy(vm_page_t m) { vm_page_object_remove(m); return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_lookup_unlocked: * * Returns the page associated with the object/offset pair specified; * if none is found, NULL is returned. The page may be no longer be * present in the object at the time that this function returns. Only * useful for opportunistic checks such as inmem(). */ vm_page_t vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex) { return (vm_radix_lookup_unlocked(&object->rtree, pindex)); } /* * vm_page_relookup: * * Returns a page that must already have been busied by * the caller. Used for bogus page replacement. */ vm_page_t vm_page_relookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; m = vm_radix_lookup_unlocked(&object->rtree, pindex); KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) && m->object == object && m->pindex == pindex, ("vm_page_relookup: Invalid page %p", m)); return (m); } /* * This should only be used by lockless functions for releasing transient * incorrect acquires. The page may have been freed after we acquired a * busy lock. In this case busy_lock == VPB_FREED and we have nothing * further to do. */ static void vm_page_busy_release(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); for (;;) { if (x == VPB_FREED) break; if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & VPB_BIT_SHARED) != 0 || (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE, ("vm_page_busy_release: %p xbusy not owned.", m)); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); break; } } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * Both pages must be exclusively busied on enter. The old page is * unbusied on exit. * * A return value of true means mold is now free. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ static bool vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret; bool dropped; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_xbusied(mold); KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0, ("vm_page_replace: page %p already in object", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mret = vm_radix_replace(&object->rtree, mnew); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); KASSERT((mold->oflags & VPO_UNMANAGED) == (mnew->oflags & VPO_UNMANAGED), ("vm_page_replace: mismatched VPO_UNMANAGED")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; /* * The object's resident_page_count does not change because we have * swapped one page for another, but the generation count should * change if the page is dirty. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF; vm_page_xunbusy(mold); return (dropped); } void vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_assert_xbusied(mnew); if (vm_page_replace_hold(mnew, object, pindex, mold)) vm_page_free(mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m)); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_object_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ static int _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages) { u_int limit, old, new; if (req_class == VM_ALLOC_INTERRUPT) limit = 0; else if (req_class == VM_ALLOC_SYSTEM) limit = vmd->vmd_interrupt_free_min; else limit = vmd->vmd_free_reserved; /* * Attempt to reserve the pages. Fail if we're below the limit. */ limit += npages; old = vmd->vmd_free_count; do { if (old < limit) return (0); new = old - npages; } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); /* Wake the page daemon if we've crossed the threshold. */ if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) pagedaemon_wakeup(vmd->vmd_domain); /* Only update bitsets on transitions. */ if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) vm_domain_set(vmd); return (1); } int vm_domain_allocate(struct vm_domain *vmd, int req, int npages) { int req_class; /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; return (_vm_domain_allocate(vmd, req_class, npages)); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m; int flags, pool; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); flags = 0; m = NULL; pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the page from a reservation? */ if (vm_object_reserv(object) && (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) != NULL) { goto found; } #endif vmd = VM_DOMAIN(domain); if (vmd->vmd_pgcache[pool].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { /* * If not, allocate it from the free page queues. */ vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, pool, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ if ((req & VM_ALLOC_ZERO) != 0) flags |= (m->flags & PG_ZERO); if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->a.flags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); else m->busy_lock = VPB_UNBUSIED; if (req & VM_ALLOC_WIRED) { vm_wire_add(1); m->ref_count = 1; } m->a.act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if (req & VM_ALLOC_WIRED) { vm_wire_sub(1); m->ref_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ m_ret = NULL; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, mpred, npages, low, high, alignment, boundary)) != NULL) { goto found; } #endif vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, npages)) { /* * allocate them from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret == NULL) { vm_domain_freecnt_inc(vmd, npages); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary)) goto again; #endif } } if (m_ret == NULL) { if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } #if VM_NRESERVLEVEL > 0 found: #endif for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); else busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->ref_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Allocate a physical page that is not intended to be inserted into a VM * object. If the "freelist" parameter is not equal to VM_NFREELIST, then only * pages from the specified vm_phys freelist will be returned. */ static __always_inline vm_page_t _vm_page_alloc_noobj_domain(int domain, const int freelist, int req) { struct vm_domain *vmd; vm_page_t m; int flags; KASSERT((req & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOOBJ)) == 0, ("%s: invalid req %#x", __func__, req)); flags = (req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0; vmd = VM_DOMAIN(domain); again: if (freelist == VM_NFREELIST && vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); if (freelist == VM_NFREELIST) m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0); else m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) vm_domain_freecnt_inc(vmd, 1); } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } found: vm_page_dequeue(m); vm_page_alloc_check(m); /* Consumers should not rely on a useful default pindex value. */ m->pindex = 0xdeadc0dedeadc0de; m->flags = (m->flags & PG_ZERO) | flags; m->a.flags = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) { vm_wire_add(1); m->ref_count = 1; } if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { KASSERT(freelist >= 0 && freelist < VM_NFREELIST, ("%s: invalid freelist %d", __func__, freelist)); return (_vm_page_alloc_noobj_domain(domain, freelist, req)); } vm_page_t vm_page_alloc_noobj(int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_noobj_domain(domain, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_noobj_domain(int domain, int req) { return (_vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req)); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); pmap_vm_page_alloc_check(m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* * The page daemon should avoid creating extra memory pressure since its * main purpose is to replenish the store of free pages. */ if (vmd->vmd_severeset || curproc == pageproc || !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, ("fictitious page %p has invalid ref count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); retry: m_inc = 1; if (vm_page_wired(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ VM_OBJECT_RLOCK(object); if (object != m->object) { VM_OBJECT_RUNLOCK(object); goto retry; } /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && (object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ run_ext = 1; } else run_ext = 0; VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page busy lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Racily check for wirings. Races are handled once the object * lock is held and the page is unmapped. */ if (vm_page_wired(m)) error = EBUSY; else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ VM_OBJECT_WLOCK(object); /* Don't care: PG_NODUMP, PG_ZERO. */ if (m->object != object || (object->type != OBJT_DEFAULT && (object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE)) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (vm_page_queue(m) != PQ_NONE && vm_page_tryxbusy(m) != 0) { if (vm_page_wired(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock; } KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT(m->oflags == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ if (!vm_page_none_valid(m)) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { vm_page_xunbusy(m); error = ENOMEM; goto unlock; } /* * Unmap the page and check for new * wirings that may have been acquired * through a pmap lookup. */ if (object->ref_count != 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); vm_page_free(m_new); error = EBUSY; goto unlock; } /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ m_new->a.flags = m->a.flags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = 0; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_replace_hold(m_new, object, m->pindex, m) && vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_page_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_page_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, minalign, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); /* * The caller will attempt an allocation after some runs have been * reclaimed and added to the vm_phys buddy lists. Due to limitations * of vm_phys_alloc_contig(), round up the requested length to the next * power of two or maximum chunk size, and ensure that each run is * suitably aligned. */ minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1); npages = roundup2(npages, minalign); if (alignment < ptoa(minalign)) alignment = ptoa(minalign); /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. */ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). */ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } int vm_wait_doms(const domainset_t *wdoms, int mflags) { int error; error = 0; /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP | mflags, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; error = msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP | mflags, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } return (error); } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. * - Called in various places after failed memory allocations. */ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { if (pageproc == NULL) panic("vm_wait in early boot"); DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom, 0); } } static int vm_wait_flags(vm_object_t obj, int mflags) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; return (vm_wait_doms(&d->ds_mask, mflags)); } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { (void)vm_wait_flags(obj, 0); } int vm_wait_intr(vm_object_t obj) { return (vm_wait_flags(obj, PCATCH)); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { vm_domain_free_assert_unlocked(vmd); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } return (0); } /* * vm_waitpfault: * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(struct domainset *dset, int timo) { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", timo); } else mtx_unlock(&vm_domainset_lock); } static struct vm_pagequeue * _vm_page_pagequeue(vm_page_t m, uint8_t queue) { return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } #ifdef INVARIANTS static struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue)); } #endif static __always_inline bool vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_astate_t tmp; tmp = *old; do { if (__predict_true(vm_page_astate_fcmpset(m, old, new))) return (true); counter_u64_add(pqstate_commit_retries, 1); } while (old->_bits == tmp._bits); return (false); } /* * Do the work of committing a queue state update that moves the page out of * its current queue. */ static bool _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_t next; vm_pagequeue_assert_locked(pq); KASSERT(vm_page_pagequeue(m) == pq, ("%s: queue %p does not match page %p", __func__, pq, m)); KASSERT(old->queue != PQ_NONE && new.queue != old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); /* * Once the queue index of the page changes there is nothing * synchronizing with further updates to the page's physical * queue state. Therefore we must speculatively remove the page * from the queue now and be prepared to roll back if the queue * state update fails. If the page is not physically enqueued then * we just update its queue index. */ if ((old->flags & PGA_ENQUEUED) != 0) { new.flags &= ~PGA_ENQUEUED; next = TAILQ_NEXT(m, plinks.q); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); if (!vm_page_pqstate_fcmpset(m, old, new)) { if (next == NULL) TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); else TAILQ_INSERT_BEFORE(next, m, plinks.q); vm_pagequeue_cnt_inc(pq); return (false); } else { return (true); } } else { return (vm_page_pqstate_fcmpset(m, old, new)); } } static bool vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_pagequeue *pq; vm_page_astate_t as; bool ret; pq = _vm_page_pagequeue(m, old->queue); /* * The queue field and PGA_ENQUEUED flag are stable only so long as the * corresponding page queue lock is held. */ vm_pagequeue_lock(pq); as = vm_page_astate_load(m); if (__predict_false(as._bits != old->_bits)) { *old = as; ret = false; } else { ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new); } vm_pagequeue_unlock(pq); return (ret); } /* * Commit a queue state update that enqueues or requeues a page. */ static bool _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_domain *vmd; vm_pagequeue_assert_locked(pq); KASSERT(old->queue != PQ_NONE && new.queue == old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); new.flags |= PGA_ENQUEUED; if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if ((old->flags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); else vm_pagequeue_cnt_inc(pq); /* * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if * both flags are set in close succession, only PGA_REQUEUE_HEAD will be * applied, even if it was set first. */ if ((old->flags & PGA_REQUEUE_HEAD) != 0) { vmd = vm_pagequeue_domain(m); KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE], ("%s: invalid page queue for page %p", __func__, m)); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); } else { TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } return (true); } /* * Commit a queue state update that encodes a request for a deferred queue * operation. */ static bool vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { KASSERT(old->queue == new.queue || new.queue != PQ_NONE, ("%s: invalid state, queue %d flags %x", __func__, new.queue, new.flags)); if (old->_bits != new._bits && !vm_page_pqstate_fcmpset(m, old, new)) return (false); vm_page_pqbatch_submit(m, new.queue); return (true); } /* * A generic queue state update function. This handles more cases than the * specialized functions above. */ bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { if (old->_bits == new._bits) return (true); if (old->queue != PQ_NONE && new.queue != old->queue) { if (!vm_page_pqstate_commit_dequeue(m, old, new)) return (false); if (new.queue != PQ_NONE) vm_page_pqbatch_submit(m, new.queue); } else { if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if (new.queue != PQ_NONE && ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0) vm_page_pqbatch_submit(m, new.queue); } return (true); } /* * Apply deferred queue state updates to a page. */ static inline void vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) { vm_page_astate_t new, old; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); KASSERT(queue < PQ_COUNT, ("%s: invalid queue index %d", __func__, queue)); KASSERT(pq == _vm_page_pagequeue(m, queue), ("%s: page %p does not belong to queue %p", __func__, m, pq)); for (old = vm_page_astate_load(m);;) { if (__predict_false(old.queue != queue || (old.flags & PGA_QUEUE_OP_MASK) == 0)) { counter_u64_add(queue_nops, 1); break; } KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is unmanaged", __func__, m)); new = old; if ((old.flags & PGA_DEQUEUE) != 0) { new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; if (__predict_true(_vm_page_pqstate_commit_dequeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } else { new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); if (__predict_true(_vm_page_pqstate_commit_requeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } } } static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { int i; for (i = 0; i < bq->bq_cnt; i++) vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); vm_batchqueue_init(bq); } /* * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. * The caller must have encoded the requested operation in the page * structure's a.flags field. */ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) { struct vm_batchqueue *bq; struct vm_pagequeue *pq; int domain; KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_page_domain(m); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); if (vm_batchqueue_insert(bq, m)) { critical_exit(); return; } critical_exit(); pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); vm_pqbatch_process(pq, bq, queue); vm_pqbatch_process_page(pq, m, queue); vm_pagequeue_unlock(pq); critical_exit(); } /* * vm_page_pqbatch_drain: [ internal use only ] * * Force all per-CPU page queue batch queues to be drained. This is * intended for use in severe memory shortages, to ensure that pages * do not remain stuck in the batch queues. */ void vm_page_pqbatch_drain(void) { struct thread *td; struct vm_domain *vmd; struct vm_pagequeue *pq; int cpu, domain, queue; td = curthread; CPU_FOREACH(cpu) { thread_lock(td); sched_bind(td, cpu); thread_unlock(td); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (queue = 0; queue < PQ_COUNT; queue++) { pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); vm_pqbatch_process(pq, DPCPU_PTR(pqbatch[domain][queue]), queue); critical_exit(); vm_pagequeue_unlock(pq); } } } thread_lock(td); sched_unbind(td); thread_unlock(td); } /* * vm_page_dequeue_deferred: [ internal use only ] * * Request removal of the given page from its current page * queue. Physical removal from the queue may be deferred * indefinitely. */ void vm_page_dequeue_deferred(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags |= PGA_DEQUEUE; } while (!vm_page_pqstate_commit_request(m, &old, new)); } /* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any, before * returning. */ void vm_page_dequeue(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; } while (!vm_page_pqstate_commit_dequeue(m, &old, new)); } /* * Schedule the given page for insertion into the specified page queue. * Physical insertion of the page may be deferred indefinitely. */ static void vm_page_enqueue(vm_page_t m, uint8_t queue) { KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); m->a.queue = queue; if ((m->a.flags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } /* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object, if it exists, must be locked, and then the page must * be xbusy. Otherwise the page must be not busied. A managed * page must be unmapped. */ static bool vm_page_free_prep(vm_page_t m) { /* * Synchronize with threads that have dropped a reference to this * page. */ atomic_thread_fence_acq(); #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, ("vm_page_free_prep: mapping flags set in page %p", m)); } else { KASSERT(m->a.queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); } VM_CNT_INC(v_tfree); if (m->object != NULL) { KASSERT(((m->oflags & VPO_UNMANAGED) != 0) == ((m->object->flags & OBJ_UNMANAGED) != 0), ("vm_page_free_prep: managed flag mismatch for page %p", m)); vm_page_assert_xbusied(m); /* * The object reference can be released without an atomic * operation. */ KASSERT((m->flags & PG_FICTITIOUS) != 0 || m->ref_count == VPRC_OBJREF, ("vm_page_free_prep: page %p has unexpected ref_count %u", m, m->ref_count)); vm_page_object_remove(m); m->ref_count -= VPRC_OBJREF; } else vm_page_assert_unbusied(m); vm_page_busy_free(m); /* * If fictitious remove object association and * return. */ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); KASSERT(m->a.queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } /* * Pages need not be dequeued before they are returned to the physical * memory allocator, but they must at least be marked for a deferred * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_dequeue_deferred(m); m->valid = 0; vm_page_undirty(m); if (m->ref_count != 0) panic("vm_page_free_prep: page %p has references", m); /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); #if VM_NRESERVLEVEL > 0 /* * Determine whether the page belongs to a reservation. If the page was * allocated from a per-CPU cache, it cannot belong to a reservation, so * as an optimization, we avoid the check in that case. */ if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) return (false); #endif return (true); } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be exclusively busied if it * belongs to an object. */ static void vm_page_free_toq(vm_page_t m) { struct vm_domain *vmd; uma_zone_t zone; if (!vm_page_free_prep(m)) return; vmd = vm_pagequeue_domain(m); zone = vmd->vmd_pgcache[m->pool].zone; if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { uma_zfree(zone, m); return; } vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); } /* * vm_page_free_pages_toq: * * Returns a list of pages to the free list, disassociating it * from any VM object. In other words, this is equivalent to * calling vm_page_free_toq() for each page of a list of VM objects. */ void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; int count; if (SLIST_EMPTY(free)) return; count = 0; while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); vm_page_free_toq(m); } if (update_wire_count) vm_wire_sub(count); } /* * Mark this page as wired down. For managed pages, this prevents reclamation * by the page daemon, or when the containing object, if any, is destroyed. */ void vm_page_wire(vm_page_t m) { u_int old; #ifdef INVARIANTS if (m->object != NULL && !vm_page_busied(m) && !vm_object_busied(m->object)) VM_OBJECT_ASSERT_LOCKED(m->object); #endif KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(m->ref_count) >= 1, ("vm_page_wire: fictitious page %p has zero wirings", m)); old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } } /* * Attempt to wire a mapped page following a pmap lookup of that page. * This may fail if a thread is concurrently tearing down mappings of the page. * The transient failure is acceptable because it translates to the * failure of the caller pmap_extract_and_hold(), which should be then * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages(). */ bool vm_page_wire_mapped(vm_page_t m) { u_int old; old = m->ref_count; do { KASSERT(old > 0, ("vm_page_wire_mapped: wiring unreferenced page %p", m)); if ((old & VPRC_BLOCKED) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } return (true); } /* * Release a wiring reference to a managed page. If the page still belongs to * an object, update its position in the page queues to reflect the reference. * If the wiring was the last reference to the page, free the page. */ static void vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) { u_int old; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is unmanaged", __func__, m)); /* * Update LRU state before releasing the wiring reference. * Use a release store when updating the reference count to * synchronize with vm_page_free_prep(). */ old = m->ref_count; do { KASSERT(VPRC_WIRE_COUNT(old) > 0, ("vm_page_unwire: wire count underflow for page %p", m)); if (old > VPRC_OBJREF + 1) { /* * The page has at least one other wiring reference. An * earlier iteration of this loop may have called * vm_page_release_toq() and cleared PGA_DEQUEUE, so * re-set it if necessary. */ if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); } else if (old == VPRC_OBJREF + 1) { /* * This is the last wiring. Clear PGA_DEQUEUE and * update the page's queue state to reflect the * reference. If the page does not belong to an object * (i.e., the VPRC_OBJREF bit is clear), we only need to * clear leftover queue state. */ vm_page_release_toq(m, nqueue, noreuse); } else if (old == 1) { vm_page_aflag_clear(m, PGA_DEQUEUE); } } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); if (VPRC_WIRE_COUNT(old) == 1) { vm_wire_sub(1); if (old == 1) vm_page_free(m); } } /* * Release one wiring of the specified page, potentially allowing it to be * paged out. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue. If the released wiring * represented the last reference to the page, the page is freed. */ void vm_page_unwire(vm_page_t m, uint8_t nqueue) { KASSERT(nqueue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", nqueue, m)); if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); return; } vm_page_unwire_managed(m, nqueue, false); } /* * Unwire a page without (re-)inserting it into a page queue. It is up * to the caller to enqueue, requeue, or free the page as appropriate. * In most cases involving managed pages, vm_page_unwire() should be used * instead. */ bool vm_page_unwire_noq(vm_page_t m) { u_int old; old = vm_page_drop(m, 1); KASSERT(VPRC_WIRE_COUNT(old) != 0, ("%s: counter underflow for page %p", __func__, m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, ("%s: missing ref on fictitious page %p", __func__, m)); if (VPRC_WIRE_COUNT(old) > 1) return (false); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_clear(m, PGA_DEQUEUE); vm_wire_sub(1); return (true); } /* * Ensure that the page ends up in the specified page queue. If the page is * active or being moved to the active queue, ensure that its act_count is * at least ACT_INIT but do not otherwise mess with it. */ static __always_inline void vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) { vm_page_astate_t old, new; KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, ("%s: invalid flags %x", __func__, nflag)); if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; old = vm_page_astate_load(m); do { if ((old.flags & PGA_DEQUEUE) != 0) break; new = old; new.flags &= ~PGA_QUEUE_OP_MASK; if (nqueue == PQ_ACTIVE) new.act_count = max(old.act_count, ACT_INIT); if (old.queue == nqueue) { if (nqueue != PQ_ACTIVE) new.flags |= nflag; } else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Put the specified page on the active list (if appropriate). */ void vm_page_activate(vm_page_t m) { vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); } /* * Move the specified page to the tail of the inactive queue, or requeue * the page if it is already in the inactive queue. */ void vm_page_deactivate(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); } void vm_page_deactivate_noreuse(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); } /* * Put a page in the laundry, or requeue it if it is already there. */ void vm_page_launder(vm_page_t m) { vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); } /* * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); vm_page_dequeue(m); vm_page_enqueue(m, PQ_UNSWAPPABLE); } /* * Release a page back to the page queues in preparation for unwiring. */ static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse) { vm_page_astate_t old, new; uint16_t nflag; /* * Use a check of the valid bits to determine whether we should * accelerate reclamation of the page. The object lock might not be * held here, in which case the check is racy. At worst we will either * accelerate reclamation of a valid page and violate LRU, or * unnecessarily defer reclamation of an invalid page. * * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ if (noreuse || m->valid == 0) { nqueue = PQ_INACTIVE; nflag = PGA_REQUEUE_HEAD; } else { nflag = PGA_REQUEUE; } old = vm_page_astate_load(m); do { new = old; /* * If the page is already in the active queue and we are not * trying to accelerate reclamation, simply mark it as * referenced and avoid any queue operations. */ new.flags &= ~PGA_QUEUE_OP_MASK; if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) new.flags |= PGA_REFERENCED; else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Unwire a page and either attempt to free it or re-add it to the page queues. */ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); if ((flags & VPR_TRYFREE) != 0) { for (;;) { object = atomic_load_ptr(&m->object); if (object == NULL) break; /* Depends on type-stability. */ if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) break; if (object == m->object) { vm_page_release_locked(m, flags); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_WUNLOCK(object); } } vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); } /* See vm_page_release(). */ void vm_page_release_locked(vm_page_t m, int flags) { VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release_locked: page %p is unmanaged", m)); if (vm_page_unwire_noq(m)) { if ((flags & VPR_TRYFREE) != 0 && (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && vm_page_tryxbusy(m)) { /* * An unlocked lookup may have wired the page before the * busy lock was acquired, in which case the page must * not be freed. */ if (__predict_true(!vm_page_wired(m))) { vm_page_free(m); return; } vm_page_xunbusy(m); } else { vm_page_release_toq(m, PQ_INACTIVE, flags != 0); } } } static bool vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) { u_int old; KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, ("vm_page_try_blocked_op: page %p has no object", m)); KASSERT(vm_page_busied(m), ("vm_page_try_blocked_op: page %p is not busy", m)); VM_OBJECT_ASSERT_LOCKED(m->object); old = m->ref_count; do { KASSERT(old != 0, ("vm_page_try_blocked_op: page %p has no references", m)); if (VPRC_WIRE_COUNT(old) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); (op)(m); /* * If the object is read-locked, new wirings may be created via an * object lookup. */ old = vm_page_drop(m, VPRC_BLOCKED); KASSERT(!VM_OBJECT_WOWNED(m->object) || old == (VPRC_BLOCKED | VPRC_OBJREF), ("vm_page_try_blocked_op: unexpected refcount value %u for %p", old, m)); return (true); } /* * Atomically check for wirings and remove all mappings of the page. */ bool vm_page_try_remove_all(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_all)); } /* * Atomically check for wirings and remove all writeable mappings of the page. */ bool vm_page_try_remove_write(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_write)); } /* * vm_page_advise * * Apply the specified advice to the given page. */ void vm_page_advise(vm_page_t m, int advice) { VM_OBJECT_ASSERT_WLOCKED(m->object); vm_page_assert_xbusied(m); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * vm_page_grab_release * * Helper routine for grab functions to release busy on return. */ static inline void vm_page_grab_release(vm_page_t m, int allocflags) { if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) vm_page_sunbusy(m); else vm_page_xunbusy(m); } } /* * vm_page_grab_sleep * * Sleep for busy according to VM_ALLOC_ parameters. Returns true * if the caller should retry and false otherwise. * * If the object is locked on entry the object will be unlocked with * false returns and still locked but possibly having been dropped * with true returns. */ static bool vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); /* * Reference the page before unlocking and sleeping so that * the page daemon is less likely to reclaim it. */ if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_reference(m); if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) && locked) VM_OBJECT_WLOCK(object); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); return (true); } /* * Assert that the grab flags are valid. */ static inline void vm_page_grab_check(int allocflags) { KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab*: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); } /* * Calculate the page allocation flags for grab. */ static inline int vm_page_grab_pflags(int allocflags) { int pflags; pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_NOBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) pflags |= VM_ALLOC_SBUSY; return (pflags); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_grab_check(allocflags); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true)) goto retrylookup; return (NULL); } goto out; } if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags)); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); out: vm_page_grab_release(m, allocflags); return (m); } /* * Locklessly attempt to acquire a page given a (object, pindex) tuple * and an optional previous page to avoid the radix lookup. The resulting * page will be validated against the identity tuple and busied or wired * as requested. A NULL *mp return guarantees that the page was not in * radix at the time of the call but callers must perform higher level * synchronization or retry the operation under a lock if they require * an atomic answer. This is the only lock free validation routine, * other routines can depend on the resulting page state. * * The return value indicates whether the operation failed due to caller * flags. The return is tri-state with mp: * * (true, *mp != NULL) - The operation was successful. * (true, *mp == NULL) - The page was not found in tree. * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition. */ static bool vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t prev, vm_page_t *mp, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev)); *mp = NULL; for (;;) { /* * We may see a false NULL here because the previous page * has been removed or just inserted and the list is loaded * without barriers. Switch to radix to verify. */ if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL || QMD_IS_TRASHED(m) || m->pindex != pindex || atomic_load_ptr(&m->object) != object) { prev = NULL; /* * This guarantees the result is instantaneously * correct. */ m = vm_radix_lookup_unlocked(&object->rtree, pindex); } if (m == NULL) return (true); if (vm_page_trybusy(m, allocflags)) { if (m->object == object && m->pindex == pindex) break; /* relookup. */ vm_page_busy_release(m); cpu_spinwait(); continue; } if (!vm_page_grab_sleep(object, m, pindex, "pgnslp", allocflags, false)) return (false); } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); *mp = m; return (true); } /* * Try to locklessly grab a page and fall back to the object lock if NOCREAT * is not set. */ vm_page_t vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags)) return (NULL); if (m != NULL) return (m); /* * The radix lockless lookup should never return a false negative * errors. If the user specifies NOCREAT they are guaranteed there * was no page present at the instant of the call. A NOCREAT caller * must handle create races gracefully. */ if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); VM_OBJECT_WLOCK(object); m = vm_page_grab(object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (m); } /* * Grab a page and make it valid, paging in if necessary. Pages missing from * their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought * in simultaneously. Additional pages will be left on a paging queue but * will neither be wired nor busy regardless of allocflags. */ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_t ma[VM_INITIAL_PAGEIN]; int after, i, pflags, rv; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); VM_OBJECT_ASSERT_WLOCKED(object); pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY | VM_ALLOC_WIRED); pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { /* * If the page is fully valid it can only become invalid * with the object lock held. If it is not valid it can * become valid with the busy lock held. Therefore, we * may unnecessarily lock the exclusive busy here if we * race with I/O completion not using the object lock. * However, we will not end up with an invalid page and a * shared lock. */ if (!vm_page_trybusy(m, vm_page_all_valid(m) ? allocflags : 0)) { (void)vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true); goto retrylookup; } if (vm_page_all_valid(m)) goto out; if ((allocflags & VM_ALLOC_NOCREAT) != 0) { vm_page_busy_release(m); *mp = NULL; return (VM_PAGER_FAIL); } } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) { goto retrylookup; } vm_page_assert_xbusied(m); if (vm_pager_has_page(object, pindex, NULL, &after)) { after = MIN(after, VM_INITIAL_PAGEIN); after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT); after = MAX(after, 1); ma[0] = m; for (i = 1; i < after; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid || !vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, m->pindex + i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } after = i; vm_object_pip_add(object, after); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, ma, after, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeupn(object, after); /* Pager may have replaced a page. */ m = ma[0]; if (rv != VM_PAGER_OK) { for (i = 0; i < after; i++) { if (!vm_page_wired(ma[i])) vm_page_free(ma[i]); else vm_page_xunbusy(ma[i]); } *mp = NULL; return (rv); } for (i = 1; i < after; i++) vm_page_readahead_finish(ma[i]); MPASS(vm_page_all_valid(m)); } else { vm_page_zero_invalid(m, TRUE); } out: if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m)) vm_page_busy_downgrade(m); else if ((allocflags & VM_ALLOC_NOBUSY) != 0) vm_page_busy_release(m); *mp = m; return (VM_PAGER_OK); } /* * Locklessly grab a valid page. If the page is not valid or not yet * allocated this will fall back to the object lock method. */ int vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int flags; int error; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY " "mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags)); /* * Attempt a lockless lookup and busy. We need at least an sbusy * before we can inspect the valid field and return a wired page. */ flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, flags)) return (VM_PAGER_FAIL); if ((m = *mp) != NULL) { if (vm_page_all_valid(m)) { if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); return (VM_PAGER_OK); } vm_page_busy_release(m); } if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } VM_OBJECT_WLOCK(object); error = vm_page_grab_valid(mp, object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (error); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. * * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT(count > 0, ("vm_page_grab_pages: invalid page count %d", count)); vm_page_grab_check(allocflags); pflags = vm_page_grab_pflags(allocflags); i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex + i, "grbmaw", allocflags, true)) goto retrylookup; break; } } else { if ((allocflags & VM_ALLOC_NOCREAT) != 0) break; m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) break; goto retrylookup; } } if (vm_page_none_valid(m) && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } vm_page_grab_release(m, allocflags); ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Unlocked variant of vm_page_grab_pages(). This accepts the same flags * and will fall back to the locked variant to handle allocation. */ int vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, pred; int flags; int i; KASSERT(count > 0, ("vm_page_grab_pages_unlocked: invalid page count %d", count)); vm_page_grab_check(allocflags); /* * Modify flags for lockless acquire to hold the page until we * set it valid if necessary. */ flags = allocflags & ~VM_ALLOC_NOBUSY; pred = NULL; for (i = 0; i < count; i++, pindex++) { if (!vm_page_acquire_unlocked(object, pindex, pred, &m, flags)) return (i); if (m == NULL) break; if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } /* m will still be wired or busy according to flags. */ vm_page_grab_release(m, allocflags); pred = ma[i] = m; } if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0) return (i); count -= i; VM_OBJECT_WLOCK(object); i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count); VM_OBJECT_WUNLOCK(object); return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } void vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) { #if PAGE_SIZE == 32768 atomic_set_64((uint64_t *)bits, set); #elif PAGE_SIZE == 16384 atomic_set_32((uint32_t *)bits, set); #elif (PAGE_SIZE == 8192) && defined(atomic_set_16) atomic_set_16((uint16_t *)bits, set); #elif (PAGE_SIZE == 4096) && defined(atomic_set_8) atomic_set_8((uint8_t *)bits, set); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_set_32((uint32_t *)addr, set << shift); #endif /* PAGE_SIZE */ } static inline void vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) { #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)bits, clear); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)bits, clear); #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) atomic_clear_16((uint16_t *)bits, clear); #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) atomic_clear_8((uint8_t *)bits, clear); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, clear << shift); #endif /* PAGE_SIZE */ } static inline vm_page_bits_t vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits) { #if PAGE_SIZE == 32768 uint64_t old; old = *bits; while (atomic_fcmpset_64(bits, &old, newbits) == 0); return (old); #elif PAGE_SIZE == 16384 uint32_t old; old = *bits; while (atomic_fcmpset_32(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16) uint16_t old; old = *bits; while (atomic_fcmpset_16(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8) uint8_t old; old = *bits; while (atomic_fcmpset_8(bits, &old, newbits) == 0); return (old); #else /* PAGE_SIZE <= 4096*/ uintptr_t addr; uint32_t old, new, mask; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, swap, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); mask = VM_PAGE_BITS_ALL << shift; old = *bits; do { new = old & ~mask; new |= newbits << shift; } while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0); return (old >> shift); #endif /* PAGE_SIZE */ } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; vm_page_bits_t pagebits; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); } /* * Set the page dirty bits and free the invalid swap space if * present. Returns the previous dirty bits. */ vm_page_bits_t vm_page_set_dirty(vm_page_t m) { vm_page_bits_t old; VM_PAGE_OBJECT_BUSY_ASSERT(m); if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { old = m->dirty; m->dirty = VM_PAGE_BITS_ALL; } else old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) vm_pager_page_unswapped(m); return (old); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { vm_page_assert_busied(m); /* * If the page is xbusied and not write mapped we are the * only thread that can modify dirty bits. Otherwise, The pmap * layer can call vm_page_dirty() without holding a distinguished * lock. The combination of page busy and atomic operations * suffice to guarantee consistency of the page dirty field. */ if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else vm_page_bits_clear(m, &m->dirty, pagebits); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PGA_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; vm_page_aflag_clear(m, PGA_NOSYNC); } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; /* * The object lock is required so that pages can't be mapped * read-only while we're in the process of invalidating them. */ object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_busied(m); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && vm_page_all_valid(m)) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); if (vm_page_xbusied(m)) { m->valid &= ~bits; m->dirty &= ~bits; } else { vm_page_bits_clear(m, &m->valid, bits); vm_page_bits_clear(m, &m->dirty, bits); } } /* * vm_page_invalid: * * Invalidates the entire page. The page must be busy, unmapped, and * the enclosing object must be locked. The object locks protects * against concurrent read-only pmap enter which is done without * busy. */ void vm_page_invalid(vm_page_t m) { vm_page_assert_busied(m); VM_OBJECT_ASSERT_LOCKED(m->object); MPASS(!pmap_page_is_mapped(m)); if (vm_page_xbusied(m)) m->valid = 0; else vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) vm_page_valid(m); } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * Some callers envoke this routine without the busy lock held and * handle races via higher level locks. Typical callers should * hold a busy lock to prevent invalidation. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { vm_page_assert_busied(m); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_valid(vm_page_t m) { vm_page_assert_busied(m); if (vm_page_xbusied(m)) m->valid = VM_PAGE_BITS_ALL; else vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of a page or object busy. */ if (m->object != NULL && !vm_page_busied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->a.queue, m->ref_count, m->a.flags, m->oflags, m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */