Index: projects/amd64_xen_pv/sys/amd64/xen/pmap.c =================================================================== --- projects/amd64_xen_pv/sys/amd64/xen/pmap.c (revision 249853) +++ projects/amd64_xen_pv/sys/amd64/xen/pmap.c (revision 249854) @@ -1,2371 +1,2436 @@ /*- * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Alan L. Cox * All rights reserved. * Copyright (c) 2012 Citrix Systems * All rights reserved. * Copyright (c) 2012, 2013 Spectra Logic Corporation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by * Cherry G. Mathew under sponsorship * from Spectra Logic Corporation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern vm_offset_t pa_index; /* from machdep.c */ extern unsigned long physfree; /* from machdep.c */ struct pmap kernel_pmap_store; #define ISDMAPVA(va) ((va) >= DMAP_MIN_ADDRESS && \ (va) <= DMAP_MAX_ADDRESS) #define ISKERNELVA(va) ((va) >= VM_MIN_KERNEL_ADDRESS && \ (va) <= VM_MAX_KERNEL_ADDRESS) #define ISBOOTVA(va) ((va) >= KERNBASE && (va) <= virtual_avail) /* XXX: keep an eye on virtual_avail */ uintptr_t virtual_avail; /* VA of first avail page (after kernel bss) */ uintptr_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; /* * VA for temp mapping to zero. * We need this because on xen, the DMAP is R/O */ const uintptr_t zerova = VM_MAX_KERNEL_ADDRESS; #define DMAP4KSUPPORT /* Temporary 4K based DMAP support */ #ifdef DMAPSUPPORT static int ndmpdp; static vm_paddr_t dmaplimit; #endif uintptr_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx = 0; /* XXX: probe for this ? 
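 *	   A minimal probe, mirroring what native amd64 does (a sketch
 *	   only, not wired up in this change), would test the AMD feature
 *	   bits gathered by identcpu:
 *
 *		if ((amd_feature & AMDID_NX) != 0)
 *			pg_nx = PG_NX;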
*/ struct msgbuf *msgbufp = 0; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ #if defined(DMAPSUPPORT) || defined(DMAP4KSUPPORT) #ifdef DMAP4KSUPPORT static u_int64_t DMPTphys; /* phys addr of direct mapped level 1 */ #endif static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ #endif /* DMAPSUPPORT || DMAP4KSUPPORT */ static vm_paddr_t boot_ptphys; /* phys addr of start of * kernel bootstrap tables */ static vm_paddr_t boot_ptendphys; /* phys addr of end of kernel * bootstrap page tables */ static size_t tsz; /* mmu_map.h opaque cookie size */ static uintptr_t (*ptmb_mappedalloc)(void) = NULL; static void (*ptmb_mappedfree)(uintptr_t) = NULL; static uintptr_t (*ptmb_ptov)(vm_paddr_t) = NULL; static vm_paddr_t (*ptmb_vtop)(uintptr_t) = NULL; extern uint64_t xenstack; /* The stack Xen gives us at boot */ extern char *console_page; /* The shared ring for console i/o */ extern struct xenstore_domain_interface *xen_store; /* xenstore page */ extern vm_map_t pv_map; vm_offset_t pv_minva, pv_maxva; /* Index offset into a pagetable, for a given va */ static int pt_index(uintptr_t va) { return ((va & PDRMASK) >> PAGE_SHIFT); } /* return kernel virtual address of 'n' claimed physical pages at boot. */ static uintptr_t vallocpages(vm_paddr_t *firstaddr, int n) { uintptr_t ret = *firstaddr + KERNBASE; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; /* Make sure we are still inside of available mapped va. */ if (PTOV(*firstaddr) > (xenstack + 512 * 1024)) { printk("Attempt to use unmapped va\n"); } KASSERT(PTOV(*firstaddr) <= (xenstack + 512 * 1024), ("Attempt to use unmapped va\n")); return (ret); } /* * At boot, xen guarantees us a 512kB padding area that is passed to * us. We must be careful not to spill the tables we create here * beyond this. */ /* Set page addressed by va to r/o */ static void pmap_xen_setpages_ro(uintptr_t va, vm_size_t npages) { vm_size_t i; for (i = 0; i < npages; i++) { PT_SET_MA(va + PAGE_SIZE * i, phystomach(ptmb_vtop(va + PAGE_SIZE * i)) | PG_U | PG_V); } } /* Set page addressed by va to r/w */ static void pmap_xen_setpages_rw(uintptr_t va, vm_size_t npages) { vm_size_t i; for (i = 0; i < npages; i++) { PT_SET_MA(va + PAGE_SIZE * i, phystomach(ptmb_vtop(va + PAGE_SIZE * i)) | PG_U | PG_V | PG_RW); } } extern int etext; /* End of kernel text (virtual address) */ extern int end; /* End of kernel binary (virtual address) */ /* Return pte flags according to kernel va access restrictions */ static pt_entry_t pmap_xen_kernel_vaflags(uintptr_t va) { if ((va > (uintptr_t) &etext && /* .data, .bss et. al */ (va < (uintptr_t) &end)) || ((va > (uintptr_t)(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE)) && va < PTOV(boot_ptphys)) || va > PTOV(boot_ptendphys)) { return PG_RW; } return 0; } uintptr_t tmpva; static void create_boot_pagetables(vm_paddr_t *firstaddr) { int i; int nkpdpe; int nkmapped = atop(VTOP(xenstack + 512 * 1024 + PAGE_SIZE)); kernel_vm_end = PTOV(ptoa(nkmapped - 1)); boot_ptphys = *firstaddr; /* lowest available r/w area */ /* Allocate pseudo-physical pages for kernel page tables. 
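 * A worked example of the sizing below (illustrative numbers only): if
 * the boot-mapped region spans 16MB, nkmapped is 4096 pages, so
 * nkpt = howmany(4096, NPTEPG = 512) = 8 page table pages and
 * nkpdpe = howmany(8, NPDEPG = 512) = 1 page directory page.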
*/ nkpt = howmany(nkmapped, NPTEPG); nkpdpe = howmany(nkpt, NPDEPG); KPML4phys = vallocpages(firstaddr, 1); KPDPphys = vallocpages(firstaddr, NKPML4E); KPDphys = vallocpages(firstaddr, nkpdpe); KPTphys = vallocpages(firstaddr, nkpt); #ifdef DMAPSUPPORT int ndm1g; ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = vallocpages(firstaddr, NDMPML4E); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) ndm1g = ptoa(Maxmem) >> PDPSHIFT; if (ndm1g < ndmpdp) DMPDphys = vallocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; #endif /* DMAPSUPPORT */ #ifdef DMAP4KSUPPORT int ndmapped = physmem; int ndmpt = howmany(ndmapped, NPTEPG); int ndmpdpe = howmany(ndmpt, NPDEPG); tmpva = ptoa(ndmapped); DMPDPphys = vallocpages(firstaddr, NDMPML4E); DMPDphys = vallocpages(firstaddr, ndmpdpe); DMPTphys = vallocpages(firstaddr, ndmpt); for (i = 0;ptoa(i) < ptoa(ndmapped); i++) { ((pt_entry_t *)DMPTphys)[i] = phystomach(i << PAGE_SHIFT); ((pt_entry_t *)DMPTphys)[i] |= PG_V | PG_U; } pmap_xen_setpages_ro(DMPTphys, (i - 1) / NPTEPG + 1); for (i = 0; i < ndmpt; i ++) { ((pd_entry_t *)DMPDphys)[i] = phystomach(VTOP(DMPTphys) + (i << PAGE_SHIFT)); ((pd_entry_t *)DMPDphys)[i] |= PG_V | PG_U; } pmap_xen_setpages_ro(DMPDphys, (ndmpt - 1) / NPDEPG + 1); for (i = 0; i < ndmpdpe; i++) { ((pdp_entry_t *)DMPDPphys)[i] = phystomach(VTOP(DMPDphys) + (i << PAGE_SHIFT)); ((pdp_entry_t *)DMPDPphys)[i] |= PG_V | PG_U; } pmap_xen_setpages_ro(DMPDPphys, (ndmpdpe - 1) / NPDPEPG + 1); /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < NDMPML4E - 1; i++) { ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = phystomach(VTOP(DMPDPphys) + (i << PAGE_SHIFT)); ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_V | PG_U; } #endif /* DMAP4KSUPPORT */ boot_ptendphys = *firstaddr - 1; /* We can't spill over beyond the 512kB padding */ KASSERT(((boot_ptendphys - boot_ptphys) / 1024) <= 512, ("bootstrap mapped memory insufficient.\n")); /* Fill in the underlying page table pages */ for (i = 0; ptoa(i) < ptoa(nkmapped); i++) { ((pt_entry_t *)KPTphys)[i] = phystomach(i << PAGE_SHIFT); ((pt_entry_t *)KPTphys)[i] |= PG_V | PG_U; ((pt_entry_t *)KPTphys)[i] |= pmap_xen_kernel_vaflags(PTOV(i << PAGE_SHIFT)); } pmap_xen_setpages_ro(KPTphys, (i - 1)/ NPTEPG + 1); /* Now map the page tables at their location within PTmap */ for (i = 0; i < nkpt; i++) { ((pd_entry_t *)KPDphys)[i] = phystomach(VTOP(KPTphys) + (i << PAGE_SHIFT)); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_U; } #ifdef SUPERPAGESUPPORT /* XXX: work out r/o overlaps and 2M machine pages*/ /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { ((pd_entry_t *)KPDphys)[i] = phystomach(i << PDRSHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_U | PG_RW | PG_V | PG_PS | PG_G; } #endif pmap_xen_setpages_ro(KPDphys, (nkpt - 1) / NPDEPG + 1); /* And connect up the PD to the PDP */ for (i = 0; i < nkpdpe; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = phystomach(VTOP(KPDphys) + (i << PAGE_SHIFT)); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } pmap_xen_setpages_ro(KPDPphys, (nkpdpe - 1) / NPDPEPG + 1); #ifdef DMAPSUPPORT int j; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. 
Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { if ((i << PDRSHIFT) > ptoa(Maxmem)) { /* * Since the page is zeroed out at * vallocpages(), the remaining ptes will be * invalid. */ break; } ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)(phystomach(i << PDRSHIFT)); /* Preset PG_M and PG_A because demotion expects it. */ ((pd_entry_t *)DMPDphys)[j] |= PG_U | PG_V | PG_PS /* | PG_G */ | PG_M | PG_A; } /* Mark pages R/O */ pmap_xen_setpages_ro(DMPDphys, ndmpdp - ndm1g); /* Setup 1G pages, if available */ for (i = 0; i < ndm1g; i++) { if ((i << PDPSHIFT) > ptoa(Maxmem)) { /* * Since the page is zeroed out at * vallocpages(), the remaining ptes will be * invalid. */ break; } ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)phystomach(i << PDPSHIFT); /* Preset PG_M and PG_A because demotion expects it. */ ((pdp_entry_t *)DMPDPphys)[i] |= PG_U | PG_V | PG_PS | PG_G | PG_M | PG_A; } for (j = 0; i < ndmpdp; i++, j++) { ((pdp_entry_t *)DMPDPphys)[i] = phystomach(VTOP(DMPDphys) + (j << PAGE_SHIFT)); ((pdp_entry_t *)DMPDPphys)[i] |= PG_V | PG_U; } pmap_xen_setpages_ro(DMPDPphys, NDMPML4E); /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < NDMPML4E; i++) { ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = phystomach(VTOP(DMPDPphys) + (i << PAGE_SHIFT)); ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_V | PG_U; } #endif /* DMAPSUPPORT */ /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = phystomach(VTOP(KPML4phys)); ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = phystomach(VTOP(KPDPphys)); ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; pmap_xen_setpages_ro(KPML4phys, 1); xen_pgdir_pin(phystomach(VTOP(KPML4phys))); } /* * * Map in the xen provided share page. Note: The console page is * mapped in later in the boot process, when kmem_alloc*() is * available. */ static void pmap_xen_bootpages(vm_paddr_t *firstaddr) { uintptr_t va; vm_paddr_t ma; /* i) Share info */ ma = xen_start_info->shared_info; /* This is a bit of a hack right now - we waste a physical * page by overwriting its original mapping to point to * the page we want ( thereby losing access to the * original page ). * * The clean solution would have been to map it in at * KERNBASE + pa, where pa is the "pseudo-physical" address of * the shared page that xen gives us. We can't seem to be able * to use the pseudo-physical address in this way because the * linear mapped virtual address seems to be outside of the * range of PTEs that we have available during bootup (ptes * take virtual address space which is limited to under * (512KB - (kernal binaries, stack et al.)) during xen * bootup). */ va = vallocpages(firstaddr, 1); PT_SET_MA(va, ma | PG_RW | PG_V | PG_U); HYPERVISOR_shared_info = (void *) va; #if 0 /* ii) Userland page table base */ va = vallocpages(firstaddr, 1); bzero((void *)va, PAGE_SIZE); /* * x86_64 has 2 privilege rings and Xen keeps separate pml4 * pointers for each, which are sanity checked on every * exit via hypervisor_iret. We therefore set up a zeroed out * user page table pml4 to satisfy/fool xen. 
*/ /* Mark the page r/o before pinning */ pmap_xen_setpages_ro(va, 1); /* Pin the table */ xen_pgdir_pin(phystomach(VTOP(va))); /* Register user page table with Xen */ xen_pt_user_switch(xpmap_ptom(VTOP(va))); #endif } /* Boot time ptov - xen guarantees bootpages to be offset */ static uintptr_t boot_ptov(vm_paddr_t p) { return PTOV(p); } /* Boot time vtop - xen guarantees bootpages to be offset */ static vm_paddr_t boot_vtop(uintptr_t v) { return VTOP(v); } /* alloc from linear mapped boot time virtual address space */ static uintptr_t mmu_alloc(void) { uintptr_t va; KASSERT(physfree != 0, ("physfree must have been set before using mmu_alloc")); va = vallocpages(&physfree, atop(PAGE_SIZE)); /* * Xen requires the page table hierarchy to be R/O. */ pmap_xen_setpages_ro(va, atop(PAGE_SIZE)); return va; } void pmap_bootstrap(vm_paddr_t *firstaddr) { /* setup mmu_map backend function pointers for boot */ ptmb_mappedalloc = mmu_alloc; ptmb_mappedfree = NULL; ptmb_ptov = boot_ptov; ptmb_vtop = boot_vtop; create_boot_pagetables(firstaddr); /* Switch to the new kernel tables */ xen_pt_switch(xpmap_ptom(VTOP(KPML4phys))); /* Unpin old page table hierarchy, and mark all its pages r/w */ xen_pgdir_unpin(phystomach(VTOP(xen_start_info->pt_base))); pmap_xen_setpages_rw(xen_start_info->pt_base, xen_start_info->nr_pt_frames); /* * gc newly free pages (bootstrap PTs and bootstrap stack, * mostly, I think.). * Record the pages as available to the VM via phys_avail[] */ /* This is the first free phys segment. see: xen.h */ KASSERT(pa_index == 0, ("reclaimed page table pages are not the lowest available!")); dump_avail[pa_index + 1] = phys_avail[pa_index] = VTOP(xen_start_info->pt_base); dump_avail[pa_index + 2] = phys_avail[pa_index + 1] = phys_avail[pa_index] + ptoa(xen_start_info->nr_pt_frames); pa_index += 2; /* Map in Xen related pages into VA space */ pmap_xen_bootpages(firstaddr); /* * Xen guarantees mapped virtual addresses at boot time upto * xenstack + 512KB. We want to use these for vallocpages() * and therefore don't want to touch these mappings since * they're scarce resources. Move along to the end of * guaranteed mapping. * * Note: Xen *may* provide mappings upto xenstack + 2MB, but * this is not guaranteed. We therefore assum that only 512KB * is available. */ virtual_avail = (uintptr_t) xenstack + 512 * 1024; /* XXX: Check we don't overlap xen pgdir entries. */ virtual_end = VM_MAX_KERNEL_ADDRESS - PAGE_SIZE; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)KPML4phys; kernel_pmap->pm_root.rt_root = 0; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ pmap_pv_init(); tsz = mmu_map_t_size(); /* Steal some memory (backing physical pages, and kva) */ physmem -= atop(round_page(msgbufsize)); msgbufp = (void *) pmap_map(&virtual_avail, ptoa(physmem), ptoa(physmem) + round_page(msgbufsize), VM_PROT_READ | VM_PROT_WRITE); bzero(msgbufp, round_page(msgbufsize)); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { pmap_pv_vm_page_init(m); } /* * Map in backing memory from kernel_vm_end to addr, * and update kernel_vm_end. */ void pmap_growkernel(uintptr_t addr) { KASSERT(kernel_vm_end < addr, ("trying to shrink kernel VA!")); addr = trunc_page(addr); char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. 
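 * (tsz is assigned exactly once, from mmu_map_t_size() in
 * pmap_bootstrap(), before any of the functions that place it on the
 * stack can run.)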
*/ mmu_map_t tptr = tbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); for (;addr <= kernel_vm_end;addr += PAGE_SIZE) { if (mmu_map_inspect_va(kernel_pmap, tptr, addr)) { continue; } int pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; vm_page_t m = vm_page_alloc(NULL, 0, pflags); KASSERT(m != NULL, ("Backing page alloc failed!")); vm_paddr_t pa = VM_PAGE_TO_PHYS(m); pmap_kenter(addr, pa); } mmu_map_t_fini(tptr); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { uintptr_t va; /* XXX: review the use of gdtset for the purpose below */ /* Get a va for console and map the console mfn into it */ vm_paddr_t ma = xen_start_info->console.domU.mfn << PAGE_SHIFT; va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); KASSERT(va != 0, ("Could not allocate KVA for console page!\n")); pmap_kenter_ma(va, ma); console_page = (void *)va; /* Get a va for the xenstore shared page */ ma = xen_start_info->store_mfn << PAGE_SHIFT; va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); KASSERT(va != 0, ("Could not allocate KVA for xenstore page!\n")); pmap_kenter_ma(va, ma); xen_store = (void *)va; /* Reserve pv VA space by allocating a submap */ KASSERT(kernel_map != 0, ("Initialising kernel submap before kernel_map!")); pv_map = kmem_suballoc(kernel_map, &pv_minva, &pv_maxva, sizeof(struct pv_chunk) * 100 /* XXX: Totally arbitrary */, 0); KASSERT(pv_map != NULL, ("Could not allocate kernel submap for pv_map!")); gdtset = 1; /* xpq may assert for locking sanity from this point onwards */ } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (void *) KPML4phys; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); pmap_pv_pmap_init(pmap); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } int pmap_pinit(pmap_t pmap) { KASSERT(pmap != kernel_pmap, ("kernel map re-initialised!", __func__)); PMAP_LOCK_INIT(pmap); /* * allocate the page directory page */ pmap->pm_pml4 = (void *) kmem_alloc(kernel_map, PAGE_SIZE); bzero(pmap->pm_pml4, PAGE_SIZE); /* * We do not wire in kernel space, or the self-referencial * entry in userspace pmaps becase both kernel and userland * share ring3 privilege. The user/kernel context switch is * arbitrated by the hypervisor by means of pre-loaded values * for kernel and user %cr3. The userland parts of kernel VA * may be conditionally overlaid with the VA of curthread, * since the kernel occasionally needs to access userland * process VA space. 
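 * The overlay is performed by pmap_xen_userload() below, which copies
 * the NUPML4E user slots of the process pml4 into KPML4phys and then
 * registers the user pml4 (user %cr3) with Xen via xen_pt_user_switch().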
*/ pmap_xen_setpages_ro((uintptr_t)pmap->pm_pml4, 1); xen_pgdir_pin(pmap_kextract_ma((uintptr_t)pmap->pm_pml4)); pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); pmap_pv_pmap_init(pmap); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return 1; } void pmap_xen_userload(pmap_t pmap) { KASSERT(pmap != kernel_pmap, ("Kernel pmap requested on user load.\n")); int i; for (i = 0; i < NUPML4E; i++) { pml4_entry_t pml4e; pml4e = (pmap->pm_pml4[i]); PT_SET_VA_MA((pml4_entry_t *)KPML4phys + i, pml4e, false); } PT_UPDATES_FLUSH(); /* Tell xen about user pmap switch */ xen_pt_user_switch(vtomach(pmap->pm_pml4)); } void pmap_release(pmap_t pmap) { KASSERT(pmap != kernel_pmap, ("%s: kernel pmap released", __func__)); xen_pgdir_unpin(pmap_kextract_ma((uintptr_t)pmap->pm_pml4)); pmap_xen_setpages_rw((uintptr_t)pmap->pm_pml4, 1); bzero(pmap->pm_pml4, PAGE_SIZE); kmem_free(kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); PMAP_LOCK_DESTROY(pmap); } static pt_entry_t * pmap_vtopte_inspect(pmap_t pmap, uintptr_t va, void *addr) { KASSERT(addr != NULL, ("addr == NULL")); mmu_map_t tptr = *(mmu_map_t *)addr; pd_entry_t *pte; /* PTE address to return */ struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); if (!mmu_map_inspect_va(pmap, tptr, va)) { return NULL; /* XXX: fix api, return some kind of #define */ } pte = mmu_map_pt(tptr); /* Read out PT from mmu state */ /* add VA offset */ pte += pt_index(va); return pte; } static pt_entry_t * vtopte_inspect(uintptr_t va, void *addr) { return pmap_vtopte_inspect(kernel_pmap, va, addr); } static pt_entry_t * pmap_vtopte_hold(pmap_t pmap, uintptr_t va, void *addr) { KASSERT(addr != NULL, ("addr == NULL")); mmu_map_t tptr = *(mmu_map_t *)addr; pd_entry_t *pte; /* PTE address to return */ struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); if (!mmu_map_inspect_va(pmap, tptr, va)) { mmu_map_hold_va(pmap, tptr, va); /* PT hierarchy */ xen_flush_queue(); } pte = mmu_map_pt(tptr); /* Read out PT from mmu state */ /* add VA offset */ pte += pt_index(va); return pte; } pt_entry_t * vtopte_hold(uintptr_t va, void *addr) { return pmap_vtopte_hold(kernel_pmap, va, addr); } static void pmap_vtopte_release(pmap_t pmap, uintptr_t va, void *addr) { mmu_map_t tptr = *(mmu_map_t *)addr; mmu_map_release_va(pmap, tptr, va); mmu_map_t_fini(tptr); } void vtopte_release(uintptr_t va, void *addr) { pmap_vtopte_release(kernel_pmap, va, addr); } #ifdef SMP void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } #endif /* SMP */ static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count -= count; } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * XXX: TODO SMP. * Note: SMP coherent. Uses a ranged shootdown IPI. 
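 *
 * Illustrative use (hypothetical caller, not part of this change):
 * map a run of wired pages into a temporary KVA window and tear it
 * down again:
 *
 *	va = kmem_alloc_nofault(kernel_map, npages * PAGE_SIZE);
 *	pmap_qenter(va, pages, npages);
 *	... access the pages through va ...
 *	pmap_qremove(va, npages);
 *	kmem_free(kernel_map, va, npages * PAGE_SIZE);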
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { KASSERT(count > 0, ("count > 0")); KASSERT(sva == trunc_page(sva), ("sva not page aligned")); KASSERT(ma != NULL, ("ma != NULL")); vm_page_t m; vm_paddr_t pa; while (count--) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pmap_kenter(sva, pa); sva += PAGE_SIZE; } // XXX: TODO: pmap_invalidate_range(kernel_pmap, sva, sva + count * // PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { KASSERT(count > 0, ("count > 0")); KASSERT(sva == trunc_page(sva), ("sva not page aligned")); vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } xen_flush_queue(); // XXX: TODO: pmap_invalidate_range(kernel_pmap, sva, va); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, vm_prot_t prot, boolean_t wired) { pt_entry_t *pte; pt_entry_t newpte, origpte; vm_paddr_t opa, pa; vm_page_t mpte, om; va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == 0) VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(VM_PAGE_TO_PHYS(m) != 0, ("VM_PAGE_TO_PHYS(m) == 0x%lx\n", VM_PAGE_TO_PHYS(m))); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(xpmap_ptom(pa) | PG_A | PG_V | PG_U); if ((access & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (wired) newpte |= PG_W; /* newpte |= pmap_cache_bits(m->md.pat_mode, 0); XXX */ mpte = NULL; PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ KASSERT(tsz != 0, ("tsz != 0")); char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; pte = pmap_vtopte_hold(pmap, va, &tptr); origpte = pte_load(pte); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Has the physical page changed? */ opa = xpmap_mtop(origpte & PG_FRAME); if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0) { newpte |= PG_MANAGED; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } } else { /* * Increment the counters. 
*/ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { newpte |= PG_MANAGED; pmap_put_pv_entry(pmap, va, m); if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: { /* XXX: This is not atomic */ origpte = pte_load(pte); PT_SET_VA_MA(pte, newpte, true); /* Sync the kernel's view of the pmap */ if (pmap != kernel_pmap && PCPU_GET(curpmap) == pmap) { /* XXX: this can be optimised to a single entry update */ pmap_xen_userload(pmap); } } opa = xpmap_mtop(origpte & PG_FRAME); if (opa != pa) { if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); if (!pmap_free_pv_entry(pmap, va, om)) { panic("Unable to free pv entry!"); } if ((om->aflags & PGA_WRITEABLE) != 0 && !pmap_page_is_mapped(om) && (om->flags & PG_FICTITIOUS) != 0) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else { PT_SET_VA_MA(pte, newpte, true); /* Sync the kernel's view of the pmap */ if (pmap != kernel_pmap && PCPU_GET(curpmap) == pmap) { /* XXX: this can be optimised to a single entry update */ pmap_xen_userload(pmap); } } if (pmap != kernel_pmap) pmap_xen_userload(pmap); unchanged: pmap_vtopte_release(pmap, va, &tptr); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_pindex_t diff, psize; vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); pmap_enter(pmap, va, prot, m, prot, false); m = TAILQ_NEXT(m, listq); } } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { /* RO and unwired */ prot = (prot & ~VM_PROT_WRITE) | VM_PROT_READ; /* XXX: do we care about "speed" ? 
*/ pmap_enter(pmap, va, prot, m, prot, false); } void * pmap_kenter_temporary(vm_paddr_t pa, int i) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return NULL; } void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, vm_offset_t va, pt_entry_t *ptq) { pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); { /* XXX: there's no way to make this atomic ? */ oldpte = pte_load(ptq); if (oldpte & PG_FRAME) { /* Optimise */ PT_CLEAR_VA(ptq, TRUE); } } if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = MACH_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (!pmap_free_pv_entry(pmap, va, m)) { panic("%s: pv 0x%lx: 0x%lx, unknown on managed page!", __func__, VM_PAGE_TO_PHYS(m), va); } if (!pmap_page_is_mapped(m) && (m->flags & PG_FICTITIOUS) == 0) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * We never remove the backing pages - that's the job of * mmu_map.[ch] */ return false; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte_load(pte) & PG_V) == 0) return; pmap_remove_pte(pmap, va, pte); pmap_invalidate_page(pmap, va); } void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { KASSERT(eva >= sva, ("End VA is lower than Start VA")); vm_offset_t va, va_next; pt_entry_t *pte; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; KASSERT(tsz != 0, ("tsz != 0")); char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. 
*/ if (sva + PAGE_SIZE == eva) { if (!mmu_map_inspect_va(pmap, tptr, sva)) { goto out; } pte = mmu_map_pt(tptr) + pt_index(sva); pmap_remove_page(pmap, sva, pte); goto out; } for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; if (!mmu_map_inspect_va(pmap, tptr, sva)) { if (mmu_map_pdpt(tptr) == NULL) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) /* Overflow */ va_next = eva; continue; } if (mmu_map_pdt(tptr) == NULL) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) /* Overflow */ va_next = eva; continue; } if (mmu_map_pt(tptr) == NULL) { va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) /* Overflow */ va_next = eva; continue; } panic("%s: All backing tables non-NULL," "yet hierarchy can't be inspected at va = 0x%lx\n", __func__, sva); } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; va = va_next; for (pte = (mmu_map_pt(tptr) + pt_index(sva)); sva != va_next;pte++, sva += PAGE_SIZE) { if (pte_load(pte) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, sva, va); va = va_next; } continue; } /* * XXX: PG_G is set on *user* entries unlike * native, where it is set on kernel entries */ if ((pte_load(pte) & PG_G) != 0) anyvalid = 1; else if (va == va_next) va = sva; pmap_remove_pte(pmap, sva, pte); } if (va != va_next) { pmap_invalidate_range(pmap, sva, va); } } out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); mmu_map_t_fini(tptr); } static bool pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt_entry_t *pte, tpte; char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; KASSERT(m != NULL, ("%s: passed NULL page!", __func__)); PMAP_LOCK(pmap); pte = pmap_vtopte_inspect(pmap, va, &tptr); KASSERT(pte != NULL, ("pte has no backing page tables!")); tpte = pte_load(pte); PT_CLEAR_VA(pte, TRUE); if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); /* XXX: Tell mmu_xxx about backing page */ pmap_vtopte_release(pmap, va, &tptr); pmap_invalidate_page(pmap, va); PMAP_UNLOCK(pmap); return false; } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. */ void pmap_remove_all(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); pmap_pv_iterate(m, pv_remove); /* free pv entry from all pmaps */ pmap_pv_page_unmap(m); } vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { KASSERT(pmap == kernel_pmap, ("XXX: %s: TODO\n", __func__)); return pmap_kextract(va); } vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { - KASSERT(0, ("XXX: %s: TODO\n", __func__)); - return 0; + PMAP_LOCK(pmap); + + pt_entry_t *pte; + vm_page_t m; + vm_paddr_t ma, pa; + + m = NULL; + pa = ma = 0; + + /* Walk the PT hierarchy to get the ma */ + char tbuf[tsz]; /* Safe to do this on the stack since tsz is + * effectively const. 
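+	 *
+	 * The physical address lock is taken with vm_page_pa_tryrelock();
+	 * when that has to drop and retake the pmap lock, the PTE may have
+	 * changed underneath us, so the lookup restarts at the retry label,
+	 * as in the native amd64 pmap_extract_and_hold().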
+ */ + + mmu_map_t tptr = tbuf; +retry: + pte = pmap_vtopte_inspect(pmap, va, &tptr); + + if (pte == NULL) { + goto nomapping; + } + + if ((pte_load(pte) & PG_V) == 0) { + goto nomapping; + } + + ma = pte_load(pte) & PG_FRAME; + + if (ma == 0) { + goto nomapping; + } + + if (((pte_load(pte) & PG_RW) || + prot & VM_PROT_WRITE) == 0) { + if (vm_page_pa_tryrelock(pmap, ma, &pa)) goto retry; + + m = MACH_TO_VM_PAGE(ma); + KASSERT(m != NULL, ("Invalid ma from pte")); + + vm_page_hold(m); + } + +nomapping: + pmap_vtopte_release(pmap, va, &tptr); + + PA_UNLOCK_COND(pa); + PMAP_UNLOCK(pmap); + + return m; } vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t ma; ma = pmap_kextract_ma(va); KASSERT((ma & ~PAGE_MASK) != 0, ("%s: Unmapped va: 0x%lx \n", __func__, va)); return xpmap_mtop(ma); } vm_paddr_t pmap_kextract_ma(vm_offset_t va) { if (ISDMAPVA(va)) { return xpmap_ptom(DMAP_TO_PHYS(va)); } vm_paddr_t ma; /* Walk the PT hierarchy to get the ma */ char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); if (!mmu_map_inspect_va(kernel_pmap, tptr, va)) { ma = 0; goto nomapping; } ma = mmu_map_pt(tptr)[pt_index(va)]; mmu_map_t_fini(tptr); nomapping: return (ma & PG_FRAME) | (va & PAGE_MASK); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_ma(va, xpmap_ptom(pa)); } void pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma) { KASSERT(tsz != 0, ("tsz != 0")); char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); if (!mmu_map_inspect_va(kernel_pmap, tptr, va)) { mmu_map_hold_va(kernel_pmap, tptr, va); /* PT hierarchy */ xen_flush_queue(); /* XXX: cleanup */ } /* Backing page tables are in place, let xen do the maths */ PT_SET_MA(va, ma | PG_RW | PG_V | PG_U); PT_UPDATES_FLUSH(); mmu_map_t_fini(tptr); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; pte = vtopte_inspect(va, &tptr); if (pte == NULL) { /* Mapping doesn't exist */ goto notmapped; } PT_CLEAR_VA(pte, TRUE); PT_UPDATES_FLUSH(); pmap_invalidate_page(kernel_pmap, va); notmapped: // XXX: vtopte_release(va, &tptr); mmu_map_t_fini(tptr); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
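 *
 * This pmap takes the latter approach; pmap_bootstrap() above uses it
 * to steal KVA for the message buffer:
 *
 *	msgbufp = (void *) pmap_map(&virtual_avail, ptoa(physmem),
 *	    ptoa(physmem) + round_page(msgbufsize),
 *	    VM_PROT_READ | VM_PROT_WRITE);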
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x", va, start, end, prot); while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } // XXX: pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { /* * XXX: TODO - ignore for now - we need to revisit this as * soon as kdb is up. */ } void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invltlb(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; /* XXX: TODO SMP */ sched_pin(); for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); sched_unpin(); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { pmap_invalidate_range(pmap, va, va + PAGE_SIZE); } void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { /* * Nothing to do - page table backing pages are the only pages * that are implicitly "wired". These are managed by uma(9). */ } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; if (dst_addr != src_addr) return; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } KASSERT(tsz != 0, ("tsz != 0")); char stbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ char dtbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t stptr = stbuf; mmu_map_t dtptr = dtbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(stptr, &mb); mmu_map_t_init(dtptr, &mb); for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); if (!mmu_map_inspect_va(src_pmap, stptr, addr)) { if (mmu_map_pdpt(stptr) == NULL) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) /* Overflow */ va_next = end_addr; continue; } if (mmu_map_pdt(stptr) == NULL) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) /* Overflow */ va_next = end_addr; continue; } if (mmu_map_pt(stptr) == NULL) { va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; continue; } panic("%s: All backing tables non-NULL," "yet hierarchy can't be inspected at va = 0x%lx\n", __func__, addr); } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; src_pte = mmu_map_pt(stptr) + pt_index(addr); while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (!mmu_map_hold_va(dst_pmap, dtptr, addr)) { goto out; } dst_pte = mmu_map_pt(dtptr) + pt_index(addr); if (pte_load(dst_pte) == 0 && pmap_put_pv_entry(dst_pmap, addr, MACH_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. 
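 * (The destination mapping is not wired, so PG_W must not be copied
 * over, and the copy should start with a clean modified/referenced
 * history of its own.)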
*/ PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), true); pmap_resident_count_inc(dst_pmap, 1); } else { goto out; } } addr += PAGE_SIZE; src_pte++; } } out: mmu_map_t_fini(dtptr); mmu_map_t_fini(stptr); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t ma_src, ma_dst; vm_offset_t va_src, va_dst; KASSERT(msrc != NULL && mdst != NULL, ("Invalid source or destination page!")); va_src = kmem_alloc_nofault(kernel_map, PAGE_SIZE * 2); va_dst = va_src + PAGE_SIZE; KASSERT(va_src != 0, ("Out of kernel virtual space!")); ma_src = VM_PAGE_TO_MACH(msrc); ma_dst = VM_PAGE_TO_MACH(mdst); pmap_kenter_ma(va_src, ma_src); pmap_kenter_ma(va_dst, ma_dst); pagecopy((void *)va_src, (void *)va_dst); pmap_kremove(va_src); pmap_kremove(va_dst); kmem_free(kernel_map, va_src, PAGE_SIZE * 2); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg, b_pg; vm_offset_t a_pg_offset, b_pg_offset; int cnt; - panic(__func__); + a_pg = kmem_alloc_nofault(kernel_map, PAGE_SIZE * 2); b_pg = a_pg + PAGE_SIZE; KASSERT(a_pg != 0, ("Out of kernel virtual space!")); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); pmap_kenter_ma(a_pg, VM_PAGE_TO_MACH(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)a_pg + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pmap_kenter_ma(b_pg, VM_PAGE_TO_MACH(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)b_pg + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pmap_kremove(a_pg); pmap_kremove(b_pg); kmem_free(kernel_map, a_pg, PAGE_SIZE * 2); } void pmap_zero_page(vm_page_t m) { pmap_kenter(zerova, VM_PAGE_TO_PHYS(m)); pagezero((void *)zerova); pmap_kremove(zerova); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { if (off == 0 && size == PAGE_SIZE) pmap_zero_page(m); else { pmap_kenter(zerova, VM_PAGE_TO_PHYS(m)); bzero((char *)zerova + off, size); pmap_kremove(zerova); } } void pmap_zero_page_idle(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif cr3 = pmap_kextract((vm_offset_t)pmap->pm_pml4); td->td_pcb->pcb_cr3 = cr3; if (__predict_false(pmap == kernel_pmap)) { xen_load_cr3(cr3); } else { pmap_xen_userload(pmap); } PCPU_SET(curpmap, pmap); critical_exit(); } /* This is subtly different from pv_remove(). We don't defer removal * of pv entries until the end. * XXX: the pmap_pv.[ch] needs review for performance and use cases. */ static bool pv_map_remove(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt_entry_t *pte, tpte; char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. 
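 * Note the PG_W check below: unlike pv_remove(), wired mappings are
 * skipped here (the callback returns false so the iteration simply
 * moves on), since wired pages cannot be removed from a process'
 * mapping at this point.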
*/ mmu_map_t tptr = tbuf; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte = pmap_vtopte_inspect(pmap, va, &tptr); KASSERT(pte != NULL, ("pte has no backing page tables!")); tpte = pte_load(pte); if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { return false; /* Continue iteration */ } if (m == NULL) { m = MACH_TO_VM_PAGE(tpte & PG_FRAME); } PT_CLEAR_VA(pte, TRUE); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { vm_page_dirty(m); } /* XXX: Tell mmu_xxx about backing page */ pmap_vtopte_release(pmap, va, &tptr); pmap_resident_count_dec(pmap, 1); if (!pmap_free_pv_entry(pmap, va, m)) { /* XXX: This is very slow */ panic("%s: pv 0x%lx: 0x%lx, unknown on managed page!", __func__, VM_PAGE_TO_PHYS(m), va); } return false; } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } PMAP_LOCK(pmap); pmap_pv_iterate_map(pmap, pv_map_remove); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } #include static bool pv_dummy(pmap_t pmap, vm_offset_t va, vm_page_t m) { return true; /* stop at the first iteration */ } boolean_t pmap_page_is_mapped(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); return pmap_pv_iterate(m, pv_dummy); } boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return 0; } int pmap_page_wired_mappings(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return -1; } boolean_t pmap_is_modified(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return 0; } boolean_t pmap_is_referenced(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return 0; } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ /* * XXX: I've just duplicated what native does here. I *think*, with * mmu_map.[ch] (which native doesn't have), addr is always * prefaultable. Research this. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { boolean_t prefaultable = false; KASSERT(tsz != 0, ("tsz != 0")); char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. */ mmu_map_t tptr = tbuf; struct mmu_map_mbackend mb = { ptmb_mappedalloc, ptmb_mappedfree, ptmb_ptov, ptmb_vtop }; mmu_map_t_init(tptr, &mb); PMAP_LOCK(pmap); prefaultable = mmu_map_inspect_va(pmap, tptr, addr); PMAP_UNLOCK(pmap); mmu_map_t_fini(tptr); return prefaultable; } void pmap_clear_modify(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } void pmap_clear_reference(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } /* Callback to remove write access on given va and pmap */ static bool pv_remove_write(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt_entry_t oldpte, *pte; char tbuf[tsz]; /* Safe to do this on the stack since tsz is * effectively const. 
*/ mmu_map_t tptr = tbuf; KASSERT(m != NULL, ("%s: passed NULL page!", __func__)); PMAP_LOCK(pmap); pte = pmap_vtopte_inspect(pmap, va, &tptr); KASSERT(pte != NULL, ("pte has no backing page tables!")); oldpte = pte_load(pte); if (oldpte & PG_RW) { PT_SET_MA(va, oldpte & ~(PG_RW | PG_M)); if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, va); } pmap_vtopte_release(pmap, va, &tptr); PMAP_UNLOCK(pmap); return false; /* Iterate through every mapping */ } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by * another thread while the object is locked. Thus, if PGA_WRITEABLE * is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0) return; pmap_pv_iterate(m, pv_remove_write); vm_page_aflag_clear(m, PGA_WRITEABLE); } int pmap_ts_referenced(vm_page_t m) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return -1; } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } void pmap_suspend() { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } void pmap_resume() { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return -1; } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return NULL; } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); } int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { KASSERT(0, ("XXX: %s: TODO\n", __func__)); return -1; } static uintptr_t xen_vm_ptov(vm_paddr_t pa) { /* Assert for valid PA *after* the VM has been init-ed */ KASSERT((gdtset == 1 || pa < physfree), ("Stray PA 0x%lx passed\n", pa)); return PHYS_TO_DMAP(pa); } static vm_paddr_t xen_vm_vtop(uintptr_t va) { if (ISBOOTVA(va) && gdtset != 1) { /* * Boot time page */ return VTOP(va); } if (ISDMAPVA(va)) { return DMAP_TO_PHYS(va); } if (ISKERNELVA(va)) { return pmap_kextract(va); } panic("Unknown VA 0x%lxpassed to %s\n", va, __func__); return 0; } static uintptr_t xen_pagezone_alloc(void) { vm_page_t m = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } static void -xen_pagezone_free(vm_offset_t page) +xen_pagezone_free(vm_offset_t va) { - if (ISBOOTVA(page)) { + vm_paddr_t pa; + vm_page_t m; + + /* We only free va obtained from xen_pagezone_alloc() */ + KASSERT(!ISKERNELVA(va), ("Trying to free unknown va: 0x%lx ", va)); + + if (ISBOOTVA(va)) { /* We don't 
		   manage this range */
		return;
	}

-	vm_page_free(PHYS_TO_VM_PAGE(page));
+	if (ISDMAPVA(va)) {
+		pa = DMAP_TO_PHYS(va);
+	}
+	else {
+		panic("Unknown va: 0x%lx\n", va);
+	}
+
+	m = PHYS_TO_VM_PAGE(pa);
+
+	KASSERT(m != NULL, ("Trying to free unknown page"));
+
+	vm_page_unwire(m, 0);
+	vm_page_free(m);

	return;
}

/*
 * Replace the custom mmu_alloc(), backed by vallocpages(), with an
 * uma backed allocator, as soon as it is possible.
 */
static void
setup_xen_pagezone(void *dummy __unused)
{
	ptmb_mappedalloc = xen_pagezone_alloc;
	ptmb_mappedfree = xen_pagezone_free;
	ptmb_vtop = xen_vm_vtop;
	ptmb_ptov = xen_vm_ptov;
}

SYSINIT(setup_xen_pagezone, SI_SUB_VM_CONF, SI_ORDER_ANY, setup_xen_pagezone, NULL);
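/*
 * Illustrative sketch (not part of this change): once setup_xen_pagezone()
 * has run, every mmu_map_t_init() call site in this file picks up the
 * VM-backed allocators through the same backend hook structure that the
 * boot-time code used, e.g.:
 *
 *	struct mmu_map_mbackend mb = {
 *		ptmb_mappedalloc,	now xen_pagezone_alloc()
 *		ptmb_mappedfree,	now xen_pagezone_free()
 *		ptmb_ptov,		now xen_vm_ptov()
 *		ptmb_vtop		now xen_vm_vtop()
 *	};
 *	mmu_map_t_init(tptr, &mb);
 *
 * so page table pages allocated after SI_SUB_VM_CONF come from
 * vm_page_alloc() and the (read-only) direct map rather than from the
 * 512KB boot scratch area handed to us by Xen.
 */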